Package Loading and Data Importing

Load Packages and Custom Functions

if (!require("BiocManager", quietly = TRUE))
    install.packages("BiocManager")

if (!require("remotes", quietly = TRUE))
    install.packages("remotes")

if (!require("pacman", quietly = TRUE))
    install.packages("pacman")

if (!require("gtsummary", quietly = TRUE))
  remotes::install_github("ddsjoberg/gtsummary")

if (!require("mixOmics", quietly = TRUE))
  BiocManager::install("mixOmicsTeam/mixOmics")

if (!require("yingtools2", quietly = TRUE))
  remotes::install_github("ying14/yingtools2")

if (!require("ggpirate", quietly = TRUE))
  remotes::install_github("mikabr/ggpirate")

if (!require("paletteer", quietly = TRUE))
  remotes::install_github("EmilHvitfeldt/paletteer")

if (!require("microbiomeMarker", quietly = TRUE))
  BiocManager::install("microbiomeMarker")

if (!require("magick", quietly = TRUE))
  install.packages("magick")

if (!require("mltest", quietly = TRUE))
  install.packages("mltest")

library(pacman)
p_load(
  factoextra,               # Dimension reduction
  FactoMineR,               # Dimension reduction
  stabs,                    # Stability variable selection
  mboost,                   # Gradient boosting for model building
  ggrepel,                  # Visualization, repels labels on plots
  bestglm,                  # Logistic regression
  knitr,                    # To change R markdown PDF options
  caret,                    # To calculate model paramters
  cutpointr,                # Calculate cutpoints
  RPostgreSQL,              # Connect to PostgreSQL server
  phyloseq,                 # Phyloseq data wrangling
  # microbiomeMarker,         # LEfSe analysis
  ComplexHeatmap,           # Heatmap generator
  paletteer,                # Extensive color palette
  tidyverse,                # Data wrangling and visualization
  circlize,                 # Color ramp builder
  rstatix,                  # Tidyverse statistics
  EnhancedVolcano,          # Volcano plot analysis
  umap,                     # Uniform Manifold Approximation Projection
  ggpubr,                   # Simple ggplots with stats
  ggpirate,                 # ggplot version of Pirate plot
  mixOmics,                 # PLS-DA
  conflicted,               # Forces conflicted packages to be used by preferred package
  gridExtra,                # Arrange multiple grobs
  reticulate,               # Run python in R
  tableone,                 # Additional clinical tables support
  lubridate,                # Manipulate date objects
  yingtools2,               # Custom plotting functions
  cowplot,                  # Combine plots together
  epiR,                     # Obtain model performance measures
  pROC,                     # Produce ROC curves
  plotrix,                  # Add table to base R plot
  glmnet,                   # LASSO regression
  forcats,                  # Factor reordering
  ggpmisc,                  # Miscellaneous ggplot helper functions
  doParallel,               # Parallelize functions (for optimizing cutpoints)
  PRISMAstatement,          # Flow table generator
  DiagrammeRsvg,            # Flow table visualizer
  rsvg,                     # Flow table visualizer
  scales,                   # Scale functions for visualizations
  devtools,                 # To load packages from GitHub
  # datscience,               # Bibliography manager
  formatR,                  # Markdown code formatter
  install = FALSE
)

library("gtsummary")  # Clinical tables, pacman had trouble loading this in, so had to go the manual route

{conflict_prefer("select", "dplyr")
conflict_prefer("mutate", "dplyr")
conflict_prefer("filter", "dplyr")
conflict_prefer("rename", "dplyr")
conflict_prefer("slice", "dplyr")
conflict_prefer("between", "dplyr")
conflict_prefer("annotate", "ggplot2")}

devtools::source_url("https://github.com/yingeddi2008/DFIutility/blob/master/getRdpPal.R?raw=TRUE")

# ggplot theme shortcuts
et <- element_text
eb <- element_blank
er <- element_rect
el <- element_line

opts_chunk$set(tidy.opts = list(width.cutoff = 60), tidy = TRUE)

`%!in%` <- negate(`%in%`)

# Running python in R
## User must set path to their python library
reticulate::use_python("/usr/bin/python3")

# Customized lefse function
source("./Data/run_lefse2.R")

Import Data

Load in all Metagenomic, Metabolomic and Clinical Data

# 1) Load R image load('./Data/LT_Modeling.RData')

# OR #

# 2) Individually Load R Objects

# Sample lookup
sample_lookup <- readRDS("./Data/sample_lookup.rds")

# First samples from all patients
first_samps <- readRDS("./Data/first_samps_anon.rds")

# Simplified dataframe containing infection information and
# relative abundance of targeted taxa
peri_matrix_all <- readRDS("./Data/peri_matrix_clin_all.rds")

# Distinct detailed infection data
peri_criteria_best <- readRDS("./Data/peri_criteria_best_anon.rds")

# All detailed infection data
peri_criteria_all <- readRDS("./Data/peri_criteria_all_anon.rds")

# NCBI taxonomy lookup
tax_lookup <- readRDS("./Data/tax_lookup.rds")

# Complete metaphlan dataframe for all patients and healthy
# donors
metaphlan_df <- readRDS("./Data/metaphlan_anon.rds")

# Metaphlan dataframe of patient samples
metaphlan_peri_anon <- readRDS("./Data/metaphlan_peri_anon.rds")

# Custom color palette
metaphlan_pal <- getRdpPal(metaphlan_df)

# Qualitative metabolomics
metab_qual_anon <- readRDS("./Data/metab_qual_anonym.rds")

# Quantitative metabolomics
metab_quant_anon <- readRDS("./Data/metab_quant_anonym.rds")

# Antibiotics data
abx <- readRDS("./Data/abx_anon.rds")

# Demographics data
demo <- readRDS("./Data/demo_anon.rds")

# Bile acid gene data
ba_genes <- readRDS("./Data/bile_acid_genes.rds")

Figures

Optimal Cutpoint Analysis: Relative Abundance of Enterococcus and Enterobacerales

# Custom function to avoid any errors during map
safe_cutpointr <- possibly(.f = cutpointr, otherwise = "Error")

# Calculate the number of cores
no_cores <- detectCores() - 2

# create the cluster for caret to use
cl <- makePSOCKcluster(no_cores)
registerDoParallel(cl)

set.seed(123456)
test_abundance <- peri_matrix_all %>%
    mutate(sampleID = as.factor(sampleID)) %>%
    select(starts_with(c("entero", "esch", "klebs", "citro",
        "rahn", "proteus")), -ends_with("abs_abundance")) %>%
    mutate_if(is.character, as.numeric) %>%
    mutate(enterococcus_infection = ifelse(`Enterococcus faecium` +
        `Enterococcus faecalis` + `Enterococcus avium` >= 1,
        1, 0), enterobacterales_infection = ifelse(`Escherichia coli` +
        `Klebsiella pneumoniae` + `Citrobacter freundii` + `Proteus mirabilis` >=
        1, 1, 0)) %>%
    select(-c(`Enterococcus faecium`, `Enterococcus faecalis`,
        `Enterococcus avium`, `Escherichia coli`, `Klebsiella pneumoniae`,
        `Citrobacter freundii`, `Proteus mirabilis`)) %>%
    pivot_longer(-c(enterococcus_infection, enterobacterales_infection),
        names_to = "variable", values_to = "value") %>%
    pivot_longer(-c(variable, value), names_to = "infection_type",
        values_to = "infection") %>%
    mutate(variable = paste0("Input: ", variable, "\nPredict: ",
        infection_type)) %>%
    group_by(infection_type, variable) %>%
    group_map(~safe_cutpointr(., value, infection, variable,
        method = maximize_metric, metric = youden, pos_class = 1,
        boot_runs = 100, allowParallel = TRUE, na.rm = T), .keep = TRUE)

# to get cutpoint object
test_abundance[4][[1]]  # ecoc rel abd, ecoc infection
## # A tibble: 1 × 18
##   subgroup                                                             direction
##   <chr>                                                                <chr>    
## 1 "Input: enterococcus_rel_abundance\nPredict: enterococcus_infection" >=       
##   optimal_cutpoint method            youden      acc sensitivity specificity
##              <dbl> <chr>              <dbl>    <dbl>       <dbl>       <dbl>
## 1         0.199356 maximize_metric 0.577090 0.723214    0.882353    0.694737
##        AUC pos_class neg_class prevalence outcome   predictor grouping
##      <dbl>     <dbl>     <dbl>      <dbl> <chr>     <chr>     <chr>   
## 1 0.829721         1         0   0.151786 infection value     variable
##   data               roc_curve            boot               
##   <list>             <list>               <list>             
## 1 <tibble [112 × 2]> <rc_ctpnt [82 × 10]> <tibble [100 × 23]>
test_abundance[1][[1]]  # ebac rel abd, ebacc infection
## # A tibble: 1 × 18
##   subgroup                                                                    
##   <chr>                                                                       
## 1 "Input: enterobacterales_rel_abundance\nPredict: enterobacterales_infection"
##   direction optimal_cutpoint method            youden    acc sensitivity
##   <chr>                <dbl> <chr>              <dbl>  <dbl>       <dbl>
## 1 >=               0.0247691 maximize_metric 0.798077 0.8125           1
##   specificity      AUC pos_class neg_class prevalence outcome   predictor
##         <dbl>    <dbl>     <dbl>     <dbl>      <dbl> <chr>     <chr>    
## 1    0.798077 0.905048         1         0  0.0714286 infection value    
##   grouping data               roc_curve            boot               
##   <chr>    <list>             <list>               <list>             
## 1 variable <tibble [112 × 2]> <rc_ctpnt [59 × 10]> <tibble [100 × 23]>
test_abundance_unnest <- test_abundance %>%
    map_df(as_tibble)

# obtain threshold cutpoints for relative abundance
optimal_cutpoint_rel <- test_abundance_unnest %>%
    separate(subgroup, into = c("Input", "Predict"), sep = "\n",
        remove = F) %>%
    filter(grepl("enterobacterales", Input) & grepl("enterobacterales",
        Predict) | grepl("enterococcus", Input) & grepl("enterococcus",
        Predict)) %>%
    group_by(pos_class) %>%
    filter(grepl("rel", subgroup)) %>%
    select(subgroup, optimal_cutpoint)

Figure 5B: Enterococcus Infection and ROC

# Expansions and Infections Stats
expan_infx_stats <- peri_matrix_all %>%
    left_join(peri_matrix_all %>%
        select(sampleID, ends_with("infection")) %>%
        ungroup()) %>%
    mutate(ecoc_infx = enterococcus_infection, ecoc_infx = ifelse(ecoc_infx >=
        1, 1, 0), ebac_infx = enterobacterales_infection, ebac_infx = ifelse(ebac_infx >=
        1, 1, 0)) %>%
    select(enterococcus_rel_abundance, enterobacterales_rel_abundance,
        ecoc_infx, ebac_infx) %>%
    summarise(ecoc_expan_above_cutpoint = sum(enterococcus_rel_abundance >=
        optimal_cutpoint_rel$optimal_cutpoint[2]), ecoc_infx_ecoc_below_cutpoint = sum(ecoc_infx ==
        1 & enterococcus_rel_abundance < optimal_cutpoint_rel$optimal_cutpoint[2],
        na.rm = T), ecoc_infx_ecoc_above_cutpoint = sum(ecoc_infx ==
        1 & enterococcus_rel_abundance >= optimal_cutpoint_rel$optimal_cutpoint[2],
        na.rm = T), ecoc_noinfx_ecoc_below_cutpoint = sum(ecoc_infx ==
        0 & enterococcus_rel_abundance < optimal_cutpoint_rel$optimal_cutpoint[2],
        na.rm = T), ecoc_noinfx_ecoc_above_cutpoint = sum(ecoc_infx ==
        0 & enterococcus_rel_abundance >= optimal_cutpoint_rel$optimal_cutpoint[2],
        na.rm = T), ebac_expan_above_cutpoint = sum(enterobacterales_rel_abundance >=
        optimal_cutpoint_rel$optimal_cutpoint[1]), ebac_infx_ebac_below_cutpoint = sum(ebac_infx ==
        1 & enterobacterales_rel_abundance < optimal_cutpoint_rel$optimal_cutpoint[1],
        na.rm = T), ebac_infx_ebac_above_cutpoint = sum(ebac_infx ==
        1 & enterobacterales_rel_abundance >= optimal_cutpoint_rel$optimal_cutpoint[1],
        na.rm = T), ebac_noinfx_ebac_below_cutpoint = sum(ebac_infx ==
        0 & enterobacterales_rel_abundance < optimal_cutpoint_rel$optimal_cutpoint[1],
        na.rm = T), ebac_noinfx_ebac_above_cutpoint = sum(ebac_infx ==
        0 & enterobacterales_rel_abundance >= optimal_cutpoint_rel$optimal_cutpoint[1],
        na.rm = T)) %>%
    rownames_to_column(var = "rowname") %>%
    pivot_longer(-rowname, names_to = "metric", values_to = "count") %>%
    mutate(percent = round((count/107) * 100, 2)) %>%
    select(-rowname)

ecoc_infx_confusion <- expan_infx_stats %>%
    filter(metric %in% c("ecoc_infx_ecoc_below_cutpoint", "ecoc_infx_ecoc_above_cutpoint",
        "ecoc_noinfx_ecoc_below_cutpoint", "ecoc_noinfx_ecoc_above_cutpoint")) %>%
    select(metric, count) %>%
    pivot_wider(names_from = metric, values_from = count)

ecoc_infx_confusion_cnfs <- data.frame(Enterococcus = c("Infection",
    "No Infection"), Expansion = c(ecoc_infx_confusion$ecoc_infx_ecoc_above_cutpoint,
    ecoc_infx_confusion$ecoc_noinfx_ecoc_above_cutpoint), `No Expansion` = c(ecoc_infx_confusion$ecoc_infx_ecoc_below_cutpoint,
    ecoc_infx_confusion$ecoc_noinfx_ecoc_below_cutpoint))

ecoc_infx_confusion_cnfs <- ecoc_infx_confusion_cnfs %>%
    mutate(sensitivity = round(ecoc_infx_confusion_cnfs[1, 2]/(ecoc_infx_confusion_cnfs[1,
        2] + ecoc_infx_confusion_cnfs[1, 3]), 3), specificity = round(ecoc_infx_confusion_cnfs[2,
        3]/(ecoc_infx_confusion_cnfs[2, 3] + ecoc_infx_confusion_cnfs[2,
        2]), 3), odds_ratio = round((ecoc_infx_confusion_cnfs[1,
        2]/ecoc_infx_confusion_cnfs[1, 3])/(ecoc_infx_confusion_cnfs[2,
        2]/ecoc_infx_confusion_cnfs[2, 3]), 3))

fig_5b <- test_abundance_unnest %>%
    separate(subgroup, into = c("Input", "Predict"), sep = "\n",
        remove = F) %>%
    filter(grepl("enterobacterales", Input) & grepl("enterobacterales",
        Predict) | grepl("enterococcus", Input) & grepl("enterococcus",
        Predict)) %>%
    group_by(pos_class) %>%
    ungroup() %>%
    unnest(roc_curve) %>%
    arrange(desc(AUC)) %>%
    select(-boot) %>%
    mutate(auc_label = paste0("AUC = ", formatC(round(AUC, 3),
        digits = 2, format = "f")), auc_label = case_when(grepl("enterococcus_rel_abundance",
        Input) ~ paste0(auc_label, " [", formatC(pull(round(boot_ci(x = test_abundance[4][[1]],
        AUC, alpha = 0.05)[3][1, ], 2)), digits = 2, format = "f"),
        ", ", formatC(pull(round(boot_ci(x = test_abundance[4][[1]],
            AUC, alpha = 0.05)[3][2, ], 2)), digits = 2, format = "f"),
        "]"), grepl("enterobacterales_rel_abundance", Input) ~
        paste0(auc_label, " [", formatC(pull(round(boot_ci(x = test_abundance[1][[1]],
            AUC, alpha = 0.05)[3][1, ], 2)), digits = 2, format = "f"),
            ", ", formatC(pull(round(boot_ci(x = test_abundance[1][[1]],
                AUC, alpha = 0.05)[3][2, ], 2)), digits = 2,
                format = "f"), "]")), acc_label = paste0("Accuracy = ",
        formatC(round(acc, 3) * 100, digits = 0, format = "f"),
        "%"), acc_label = case_when(grepl("enterococcus_rel_abundance",
        Input) ~ paste0(acc_label, " [", paste0(formatC(pull(round(boot_ci(x = test_abundance[4][[1]],
        acc, alpha = 0.05)[3][1, ], 2)) * 100, digits = 0, format = "f"),
        "%"), ", ", paste0(formatC(pull(round(boot_ci(x = test_abundance[4][[1]],
        acc, alpha = 0.05)[3][2, ], 2)) * 100, digits = 0, format = "f"),
        "%"), "]"), grepl("enterobacterales_rel_abundance", Input) ~
        paste0(acc_label, " [", paste0(formatC(pull(round(boot_ci(x = test_abundance[1][[1]],
            acc, alpha = 0.05)[3][1, ], 2)) * 100, digits = 0,
            format = "f"), "%"), ", ", paste0(formatC(pull(round(boot_ci(x = test_abundance[1][[1]],
            acc, alpha = 0.05)[3][2, ], 2)) * 100, digits = 0,
            format = "f"), "%"), "]")), sens_label = paste0("Sensitivity = ",
        formatC(round(sensitivity, 3) * 100, digits = 0, format = "f"),
        "%"), sens_label = case_when(grepl("enterococcus_rel_abundance",
        Input) ~ paste0(sens_label, " [", paste0(formatC(pull(round(boot_ci(x = test_abundance[4][[1]],
        sensitivity, alpha = 0.05)[3][1, ], 2)) * 100, digits = 0,
        format = "f"), "%"), ", ", paste0(formatC(pull(round(boot_ci(x = test_abundance[4][[1]],
        sensitivity, alpha = 0.05)[3][2, ], 2)) * 100, digits = 0,
        format = "f"), "%"), "]"), grepl("enterobacterales_rel_abundance",
        Input) ~ paste0(sens_label, " [", paste0(formatC(pull(round(boot_ci(x = test_abundance[1][[1]],
        sensitivity, alpha = 0.05)[3][1, ], 2)) * 100, digits = 0,
        format = "f"), "%"), ", ", paste0(formatC(pull(round(boot_ci(x = test_abundance[1][[1]],
        sensitivity, alpha = 0.05)[3][2, ], 2)) * 100, digits = 0,
        format = "f"), "%"), "]")), spec_label = paste0("Specificity = ",
        formatC(round(specificity, 3) * 100, digits = 0, format = "f"),
        "%"), spec_label = case_when(grepl("enterococcus_rel_abundance",
        Input) ~ paste0(spec_label, " [", paste0(formatC(pull(round(boot_ci(x = test_abundance[4][[1]],
        specificity, alpha = 0.05)[3][1, ], 2)) * 100, digits = 0,
        format = "f"), "%"), ", ", paste0(formatC(pull(round(boot_ci(x = test_abundance[4][[1]],
        specificity, alpha = 0.05)[3][2, ], 2)) * 100, digits = 0,
        format = "f"), "%"), "]"), grepl("enterobacterales_rel_abundance",
        Input) ~ paste0(spec_label, " [", paste0(formatC(pull(round(boot_ci(x = test_abundance[1][[1]],
        specificity, alpha = 0.05)[3][1, ], 2)) * 100, digits = 0,
        format = "f"), "%"), ", ", paste0(formatC(pull(round(boot_ci(x = test_abundance[1][[1]],
        specificity, alpha = 0.05)[3][2, ], 2)) * 100, digits = 0,
        format = "f"), "%"), "]"))) %>%
    filter(grepl(pattern = "rel_abundance", x = Input)) %>%
    mutate(subgroup = str_to_title(subgroup), subgroup = gsub(pattern = "_",
        replacement = " ", x = subgroup), subgroup = gsub(pattern = "rel abundance",
        replacement = "Relative Abundance (%)", x = subgroup),
        subgroup2 = ifelse(grepl(x = subgroup, pattern = "enterococcus",
            ignore.case = TRUE), "Enterococcus", "Enterobacterales"),
        subgroup2 = factor(subgroup2, levels = c("Enterococcus",
            "Enterobacterales")), odds_label = paste0("OR = ",
            ecoc_infx_confusion_cnfs$odds_ratio[1])) %>%
    filter(subgroup2 == "Enterococcus") %>%
    ggplot(aes(x = fpr, y = tpr, color = subgroup2)) + geom_line(size = 1.2) +
    geom_text(aes(label = auc_label, x = 0.5, y = 0.16), show.legend = F,
        hjust = 0) + geom_text(aes(label = acc_label, x = 0.5,
    y = 0.12), show.legend = F, hjust = 0) + geom_text(aes(label = spec_label,
    x = 0.5, y = 0.08), show.legend = F, hjust = 0) + geom_text(aes(label = sens_label,
    x = 0.5, y = 0.04), show.legend = F, hjust = 0) + geom_text(aes(label = paste0("Cutpoint = ",
    round((optimal_cutpoint), digits = 3) * 100, "%"), x = 0.5,
    y = 0.2), hjust = 0, show.legend = F) + geom_text(aes(label = odds_label,
    x = 0.5, y = 0), show.legend = F, hjust = 0) + theme_bw() +
    theme(axis.text = et(color = "black", size = 12), axis.title = et(color = "black",
        size = 14), legend.text = et(size = 12), legend.title = et(size = 14),
        legend.spacing.y = unit(0.5, "cm"), legend.position = "none",
        panel.grid = eb(), strip.text = et(size = 14, color = "#2dc46b",
            face = "bold"), strip.background = eb(), panel.border = eb(),
        panel.spacing.y = unit(20, "mm"), axis.line.x = el(color = "black")) +
    geom_vline(xintercept = -0.05) + geom_hline(yintercept = -0.05) +
    geom_table_npc(data = ecoc_infx_confusion_cnfs %>%
        column_to_rownames(var = "Enterococcus") %>%
        select(Expansion, `No Expansion` = No.Expansion) %>%
        mutate(Total = rowSums(.)), label = list(ecoc_infx_confusion_cnfs %>%
        column_to_rownames(var = "Enterococcus") %>%
        select(Expansion, `No Expansion` = No.Expansion) %>%
        mutate(Total = rowSums(.))), npcx = 0.25, npcy = 0.5,
        hjust = 0, vjust = 1, table.rownames = TRUE, table.theme = ttheme_minimal(core = list(bg_params = list(col = "#2dc46b"),
            fg_params = list(col = "#2dc46b")), colhead = list(fg_params = list(col = "#2dc46b",
            fontface = "bold")), rowhead = list(fg_params = list(col = "#2dc46b",
            fontface = "bold")))) + scale_x_continuous(expand = expansion(add = c(0.001,
    0.05))) + scale_y_continuous(expand = expansion(add = c(0.001,
    0.05))) + facet_wrap(~subgroup2, ncol = 1, scales = "fixed") +
    ylab("True Positive Rate\n") + xlab("\nFalse Positive Rate") +
    scale_color_manual(values = c("#2dc46b")) + guides(color = guide_legend("Groups",
    byrow = T, override.aes = list(size = 5)))

fig_5b

Figure 5D: Enterococcus Infection and ROC

ebac_infx_confusion <- expan_infx_stats %>%
    filter(metric %in% c("ebac_infx_ebac_below_cutpoint", "ebac_infx_ebac_above_cutpoint",
        "ebac_noinfx_ebac_below_cutpoint", "ebac_noinfx_ebac_above_cutpoint")) %>%
    select(metric, count) %>%
    pivot_wider(names_from = metric, values_from = count)

ebac_infx_confusion_cnfs <- data.frame(Enterobacterales = c("Infection",
    "No Infection"), Expansion = c(ebac_infx_confusion$ebac_infx_ebac_above_cutpoint,
    ebac_infx_confusion$ebac_noinfx_ebac_above_cutpoint), `No Expansion` = c(ebac_infx_confusion$ebac_infx_ebac_below_cutpoint,
    ebac_infx_confusion$ebac_noinfx_ebac_below_cutpoint))

ebac_infx_confusion_cnfs <- ebac_infx_confusion_cnfs %>%
    mutate(sensitivity = round(ebac_infx_confusion_cnfs[1, 2]/(ebac_infx_confusion_cnfs[1,
        2] + ebac_infx_confusion_cnfs[1, 3]), 3), specificity = round(ebac_infx_confusion_cnfs[2,
        3]/(ebac_infx_confusion_cnfs[2, 3] + ebac_infx_confusion_cnfs[2,
        2]), 3), odds_ratio = round((ebac_infx_confusion_cnfs[1,
        2]/ebac_infx_confusion_cnfs[1, 3])/(ebac_infx_confusion_cnfs[2,
        2]/ebac_infx_confusion_cnfs[2, 3]), 3))


fig_5d <- test_abundance_unnest %>%
    separate(subgroup, into = c("Input", "Predict"), sep = "\n",
        remove = F) %>%
    filter(grepl("enterobacterales", Input) & grepl("enterobacterales",
        Predict) | grepl("enterococcus", Input) & grepl("enterococcus",
        Predict)) %>%
    group_by(pos_class) %>%
    ungroup() %>%
    unnest(roc_curve) %>%
    arrange(desc(AUC)) %>%
    select(-boot) %>%
    mutate(auc_label = paste0("AUC = ", formatC(round(AUC, 3),
        digits = 2, format = "f")), auc_label = case_when(grepl("enterococcus_rel_abundance",
        Input) ~ paste0(auc_label, " [", formatC(pull(round(boot_ci(x = test_abundance[4][[1]],
        AUC, alpha = 0.05)[3][1, ], 2)), digits = 2, format = "f"),
        ", ", formatC(pull(round(boot_ci(x = test_abundance[4][[1]],
            AUC, alpha = 0.05)[3][2, ], 2)), digits = 2, format = "f"),
        "]"), grepl("enterobacterales_rel_abundance", Input) ~
        paste0(auc_label, " [", formatC(pull(round(boot_ci(x = test_abundance[1][[1]],
            AUC, alpha = 0.05)[3][1, ], 2)), digits = 2, format = "f"),
            ", ", formatC(pull(round(boot_ci(x = test_abundance[1][[1]],
                AUC, alpha = 0.05)[3][2, ], 2)), digits = 2,
                format = "f"), "]")), acc_label = paste0("Accuracy = ",
        formatC(round(acc, 3) * 100, digits = 0, format = "f"),
        "%"), acc_label = case_when(grepl("enterococcus_rel_abundance",
        Input) ~ paste0(acc_label, " [", paste0(formatC(pull(round(boot_ci(x = test_abundance[4][[1]],
        acc, alpha = 0.05)[3][1, ], 2)) * 100, digits = 0, format = "f"),
        "%"), ", ", paste0(formatC(pull(round(boot_ci(x = test_abundance[4][[1]],
        acc, alpha = 0.05)[3][2, ], 2)) * 100, digits = 0, format = "f"),
        "%"), "]"), grepl("enterobacterales_rel_abundance", Input) ~
        paste0(acc_label, " [", paste0(formatC(pull(round(boot_ci(x = test_abundance[1][[1]],
            acc, alpha = 0.05)[3][1, ], 2)) * 100, digits = 0,
            format = "f"), "%"), ", ", paste0(formatC(pull(round(boot_ci(x = test_abundance[1][[1]],
            acc, alpha = 0.05)[3][2, ], 2)) * 100, digits = 0,
            format = "f"), "%"), "]")), sens_label = paste0("Sensitivity = ",
        formatC(round(sensitivity, 3) * 100, digits = 0, format = "f"),
        "%"), sens_label = case_when(grepl("enterococcus_rel_abundance",
        Input) ~ paste0(sens_label, " [", paste0(formatC(pull(round(boot_ci(x = test_abundance[4][[1]],
        sensitivity, alpha = 0.05)[3][1, ], 2)) * 100, digits = 0,
        format = "f"), "%"), ", ", paste0(formatC(pull(round(boot_ci(x = test_abundance[4][[1]],
        sensitivity, alpha = 0.05)[3][2, ], 2)) * 100, digits = 0,
        format = "f"), "%"), "]"), grepl("enterobacterales_rel_abundance",
        Input) ~ paste0(sens_label, " [", paste0(formatC(pull(round(boot_ci(x = test_abundance[1][[1]],
        sensitivity, alpha = 0.05)[3][1, ], 2)) * 100, digits = 0,
        format = "f"), "%"), ", ", paste0(formatC(pull(round(boot_ci(x = test_abundance[1][[1]],
        sensitivity, alpha = 0.05)[3][2, ], 2)) * 100, digits = 0,
        format = "f"), "%"), "]")), spec_label = paste0("Specificity = ",
        formatC(round(specificity, 3) * 100, digits = 0, format = "f"),
        "%"), spec_label = case_when(grepl("enterococcus_rel_abundance",
        Input) ~ paste0(spec_label, " [", paste0(formatC(pull(round(boot_ci(x = test_abundance[4][[1]],
        specificity, alpha = 0.05)[3][1, ], 2)) * 100, digits = 0,
        format = "f"), "%"), ", ", paste0(formatC(pull(round(boot_ci(x = test_abundance[4][[1]],
        specificity, alpha = 0.05)[3][2, ], 2)) * 100, digits = 0,
        format = "f"), "%"), "]"), grepl("enterobacterales_rel_abundance",
        Input) ~ paste0(spec_label, " [", paste0(formatC(pull(round(boot_ci(x = test_abundance[1][[1]],
        specificity, alpha = 0.05)[3][1, ], 2)) * 100, digits = 0,
        format = "f"), "%"), ", ", paste0(formatC(pull(round(boot_ci(x = test_abundance[1][[1]],
        specificity, alpha = 0.05)[3][2, ], 2)) * 100, digits = 0,
        format = "f"), "%"), "]"))) %>%
    filter(grepl(pattern = "rel_abundance", x = Input)) %>%
    mutate(subgroup = str_to_title(subgroup), subgroup = gsub(pattern = "_",
        replacement = " ", x = subgroup), subgroup = gsub(pattern = "rel abundance",
        replacement = "Relative Abundance (%)", x = subgroup),
        subgroup2 = ifelse(grepl(x = subgroup, pattern = "enterococcus",
            ignore.case = TRUE), "Enterococcus", "Enterobacterales"),
        subgroup2 = factor(subgroup2, levels = c("Enterococcus",
            "Enterobacterales")), odds_label = paste0("OR = ",
            ebac_infx_confusion_cnfs$odds_ratio[1])) %>%
    filter(subgroup2 == "Enterobacterales") %>%
    ggplot(aes(x = fpr, y = tpr, color = subgroup2)) + geom_line(size = 1.2) +
    geom_text(aes(label = auc_label, x = 0.5, y = 0.16), show.legend = F,
        hjust = 0) + geom_text(aes(label = acc_label, x = 0.5,
    y = 0.12), show.legend = F, hjust = 0) + geom_text(aes(label = spec_label,
    x = 0.5, y = 0.08), show.legend = F, hjust = 0) + geom_text(aes(label = sens_label,
    x = 0.5, y = 0.04), show.legend = F, hjust = 0) + geom_text(aes(label = paste0("Cutpoint = ",
    round((optimal_cutpoint), digits = 3) * 100, "%"), x = 0.5,
    y = 0.2), hjust = 0, show.legend = F) + geom_text(aes(label = odds_label,
    x = 0.5, y = 0), show.legend = F, hjust = 0) + theme_bw() +
    theme(axis.text = et(color = "black", size = 12), axis.title = et(color = "black",
        size = 14), legend.text = et(size = 12), legend.title = et(size = 14),
        legend.spacing.y = unit(0.5, "cm"), legend.position = "none",
        panel.grid = eb(), strip.text = et(size = 14, color = "red",
            face = "bold"), strip.background = eb(), panel.border = eb(),
        panel.spacing.y = unit(20, "mm"), axis.line.x = el(color = "black")) +
    geom_vline(xintercept = -0.05) + geom_hline(yintercept = -0.05) +
    geom_table_npc(data = ebac_infx_confusion_cnfs %>%
        column_to_rownames(var = "Enterobacterales") %>%
        select(Expansion, `No Expansion` = No.Expansion) %>%
        mutate(Total = rowSums(.)), label = list(ebac_infx_confusion_cnfs %>%
        column_to_rownames(var = "Enterobacterales") %>%
        select(Expansion, `No Expansion` = No.Expansion) %>%
        mutate(Total = rowSums(.))), npcx = 0.25, npcy = 0.5,
        hjust = 0, vjust = 1, table.rownames = TRUE, table.theme = ttheme_minimal(core = list(bg_params = list(col = "red"),
            fg_params = list(col = "red")), colhead = list(fg_params = list(col = "red",
            fontface = "bold")), rowhead = list(fg_params = list(col = "red",
            fontface = "bold")))) + scale_x_continuous(expand = expansion(add = c(0.001,
    0.05))) + scale_y_continuous(expand = expansion(add = c(0.001,
    0.05))) + facet_wrap(~subgroup2, ncol = 1, scales = "fixed") +
    ylab("True Positive Rate\n") + xlab("\nFalse Positive Rate") +
    scale_color_manual(values = c("red")) + guides(color = guide_legend("Groups",
    byrow = T, override.aes = list(size = 5)))

fig_5d

Enterococcus Expansion: Volcano Analysis

# Heatmap compounds and their categories
heatmap_lookup <- read.csv("./Data/qual_heatmap_lookup.csv",
    stringsAsFactors = FALSE)

# Build heatmap compound list
heatmap_cmpds <- metab_qual_anon %>%
    mutate(compound = str_to_title(compound), compound = recode(compound,
        Preq1 = "PreQ1")) %>%
    filter(compound %in% heatmap_lookup$compound) %>%
    distinct(compound) %>%
    drop_na()

qual_log2fc_ecoc_expan <- metab_qual_anon %>%
    mutate(compound = ifelse(compound == "isovaleric-acid", "isovalerate",
        compound), compound = str_to_title(compound), compound = recode(compound,
        Preq1 = "PreQ1")) %>%
    filter(compound %in% heatmap_cmpds$compound) %>%
    left_join(peri_matrix_all %>%
        select(sampleID, enterococcus_rel_abundance)) %>%
    drop_na() %>%
    mutate(enterococcus_expansion = ifelse(enterococcus_rel_abundance >=
        optimal_cutpoint_rel$optimal_cutpoint[2], 1, 0)) %>%
    arrange(enterococcus_expansion, sampleID) %>%
    group_by(compound) %>%
    mutate(enterococcus_expansion_0 = length(mvalue[enterococcus_expansion ==
        "0"]), enterococcus_expansion_1 = length(mvalue[enterococcus_expansion ==
        "1"])) %>%
    filter(any(mvalue != 0)) %>%
    summarise(log2fc_val = log((mean(mvalue[enterococcus_expansion ==
        "0"], na.rm = T)/mean(mvalue[enterococcus_expansion ==
        "1"], na.rm = T)), base = 2))  # 0 = No Expansion, 1 = Expansion


qual_pval_ecoc_expan <- metab_qual_anon %>%
    mutate(compound = ifelse(compound == "isovaleric-acid", "isovalerate",
        compound), compound = str_to_title(compound), compound = recode(compound,
        Preq1 = "PreQ1")) %>%
    filter(compound %in% heatmap_cmpds$compound) %>%
    left_join(peri_matrix_all %>%
        select(sampleID, enterococcus_rel_abundance)) %>%
    drop_na() %>%
    mutate(enterococcus_expansion = ifelse(enterococcus_rel_abundance >=
        optimal_cutpoint_rel$optimal_cutpoint[2], 1, 0)) %>%
    arrange(enterococcus_expansion, sampleID) %>%
    group_by(compound) %>%
    mutate(enterococcus_expansion_0 = length(mvalue[enterococcus_expansion ==
        "0"]), enterococcus_expansion_1 = length(mvalue[enterococcus_expansion ==
        "1"])) %>%
    filter(any(mvalue != 0)) %>%
    rstatix::wilcox_test(mvalue ~ enterococcus_expansion) %>%
    rstatix::adjust_pvalue(method = "BH") %>%
    rstatix::add_significance("p.adj")

qual_tot_ecoc_expan <- left_join(qual_log2fc_ecoc_expan, qual_pval_ecoc_expan) %>%
    column_to_rownames(var = "compound")

# volcano label color
ecoc_expan_volcano_labcol <- qual_tot_ecoc_expan %>%
    filter(p.adj <= 0.05 & abs(log2fc_val) >= 0.75) %>%
    mutate(color = ifelse(log2fc_val < 0, "#389458", "gray47"))

# # Volcano Plot (adjusted) set.seed(456)
# volcano_adj_ecoc_expan <-
# EnhancedVolcano(qual_tot_ecoc_expan, lab =
# rownames(qual_tot_ecoc_expan), x = 'log2fc_val', y =
# 'p.adj', title = NULL, pCutoff = 0.05, FCcutoff = 0.75,
# pointSize = 6, labSize = 8, axisLabSize = 32, labCol =
# ecoc_expan_volcano_labcol$color, caption = NULL, colAlpha
# = 0.65, col = c('gray75', c('#D4CA15', '#912777',
# '#1238E3')), xlim = c(-2.5, 4), ylim = c(0, 8),
# legendPosition = 'none', legendLabels =
# c(expression(p.adj > 0.05*';' ~ Log[2] ~ FC <
# '\u00B1'*0.75), expression(p.adj > 0.05*';' ~ Log[2] ~ FC
# >= '\u00B1'*0.75), expression(p.adj <= 0.05*';' ~ Log[2]
# ~ FC < '\u00B1'*0.75), expression(p.adj <= 0.05*';' ~
# Log[2] ~ FC >= '\u00B1'*0.75)), legendLabSize = 14,
# boxedLabels = T, drawConnectors = T, widthConnectors =
# 0.2, arrowheads = F, gridlines.minor = F, gridlines.major
# = F, max.overlaps = Inf ) + theme( axis.text = et(color =
# 'black'), legend.text = et(hjust = 0), plot.margin =
# unit(c(0, 0, 0, 0), 'cm') ) + labs(subtitle = NULL) +
# annotate('segment', x = 0.8, xend = 3, y = 7.95, yend =
# 7.95, arrow = arrow(), size = 2, color = 'gray67') +
# annotate('text', x = 1.65, y = 8.15, label = 'No
# Expansion', size = 9, color = 'gray67') +
# annotate('rect', xmin = 0.75, xmax = Inf, ymin =
# -log(0.05, base = 10), ymax = Inf, alpha = .1, fill =
# 'gray87') + annotate('segment', x = -0.8, xend = -2.5, y
# = 7.95, yend = 7.95, arrow = arrow(), size = 2, color =
# '#389458') + annotate('text', x = -1.55, y = 8.15, label
# = 'Expansion', size = 9, color = '#389458') +
# annotate('rect', xmin = -0.75, xmax = -Inf, ymin =
# -log(0.05, base = 10), ymax = Inf, alpha = .1, fill =
# '#389458') + guides(color = guide_legend(nrow = 4), shape
# = guide_legend(nrow = 4)) + scale_y_continuous(expand =
# expansion(add = c(0, 0.15))) volcano_adj_ecoc_expan
# ggsave(plot = volcano_adj_ecoc_expan, filename =
# './Results/Figure_5A.pdf', width = 24, height = 11)

Enterococcus Expansion: Distribution

# Using unadjusted p-values for down-selection of
# metabolites, show distribution of normalized peak area
boxplot_ecoc_expan <- metab_qual_anon %>%
    mutate(compound = ifelse(compound == "isovaleric-acid", "isovalerate",
        compound), compound = str_to_title(compound), compound = recode(compound,
        Preq1 = "PreQ1")) %>%
    filter(compound %in% heatmap_cmpds$compound) %>%
    left_join(peri_matrix_all %>%
        select(sampleID, enterococcus_rel_abundance)) %>%
    drop_na() %>%
    mutate(enterococcus_expansion = ifelse(enterococcus_rel_abundance >=
        optimal_cutpoint_rel$optimal_cutpoint[2], 1, 0), enterococcus_expansion = ifelse(enterococcus_expansion ==
        1, "Expansion", "No Expansion")) %>%
    arrange(enterococcus_expansion, sampleID) %>%
    group_by(compound) %>%
    mutate(enterococcus_expansion_0 = length(mvalue[enterococcus_expansion ==
        "No Expansion"]), enterococcus_expansion_1 = length(mvalue[enterococcus_expansion ==
        "Expansion"])) %>%
    filter(any(mvalue != 0)) %>%
    right_join(qual_tot_ecoc_expan %>%
        rownames_to_column(var = "compound") %>%
        mutate(abs_log2fc_val = abs(log2fc_val)) %>%
        filter(p <= 0.05, abs_log2fc_val >= 1))

if (nrow(boxplot_ecoc_expan) > 0) {
    print(ggpar(ggboxplot(boxplot_ecoc_expan, x = "enterococcus_expansion",
        y = "mvalue", color = "enterococcus_expansion", palette = c("#389458",
            "gray87"), ylab = "Normalized Peak Area", xlab = "",
        outlier.shape = NA), legend.title = "Enterococcus") +
        stat_compare_means(label.y.npc = 0.75) + facet_wrap(~compound,
        scales = "free_y") + geom_point(data = boxplot_ecoc_expan,
        aes(x = enterococcus_expansion, y = mvalue, color = enterococcus_expansion),
        position = position_jitter(width = 0.2), size = 2, alpha = 0.65))

    ggsave(filename = "./Results/enterococcus_expansion_boxplot.pdf",
        width = 24, height = 18)
} else {
    print("no significant observations")
}

taxUMAP Data Preparation

# Custom colors
pirate_colors <- c("#1A49BE", "#3A001E")
pirate_colors2 <- c("#3A001E", "#3A001E", "#3A001E", "#1A49BE")

t_metaphlan <- metaphlan_df %>%
    filter(sampleID %in% first_samps$sampleID | grepl(sampleID,
        pattern = "hd")) %>%
    mutate(db = ifelse(grepl(sampleID, pattern = "lt"), "Liver Transplant",
        "Healthy Donor")) %>%
    select(sampleID, taxid, db, pctseqs, Total) %>%
    group_by(sampleID, taxid, pctseqs) %>%
    slice(1) %>%
    ungroup() %>%
    filter(pctseqs >= 1e-04) %>%
    group_by(sampleID) %>%
    dplyr::add_count(taxid, name = "totalSp") %>%
    mutate(sampleID_count = length(unique(sampleID)), spPres = totalSp/sampleID_count) %>%
    filter(spPres >= 0.1) %>%
    select(-c(Total, sampleID_count, spPres, totalSp)) %>%
    group_by(sampleID) %>%
    mutate(pctseqs = pctseqs/sum(pctseqs))


t_metaphlan_mat <- t_metaphlan %>%
    distinct() %>%
    pivot_wider(names_from = "taxid", values_from = "pctseqs",
        values_fill = 0) %>%
    column_to_rownames(var = "sampleID") %>%
    select(-db)


# taxUMAP microbiota table
taxumap_microbiota <- t_metaphlan_mat %>%
    rownames_to_column(var = "index_column") %>%
    pivot_longer(-index_column, names_to = "taxid", values_to = "pctseq") %>%
    mutate(taxid = paste0("taxID", taxid)) %>%
    pivot_wider(names_from = "taxid", values_from = "pctseq")

write.csv(taxumap_microbiota, "./Results/microbiota_table.csv",
    row.names = FALSE)

# taxUMAP taxonomy table
taxumap_taxonomy <- t_metaphlan_mat %>%
    rownames_to_column(var = "index_column") %>%
    pivot_longer(-index_column, names_to = "taxid", values_to = "pctseq") %>%
    left_join(tax_lookup %>%
        mutate(taxid = as.character(taxid))) %>%
    mutate(taxid = paste0("taxID", taxid)) %>%
    distinct(Kingdom, Phylum, Class, Order, Family, Genus, Species,
        taxid) %>%
    transmute(OTU = taxid, Kingdom, Phylum = if_else(Phylum ==
        "", "unclassified", Phylum), Class = if_else(Class ==
        "", "unclassified", Class), Order = if_else(Order ==
        "", "unclassified", Order), Family = if_else(Family ==
        "", "unclassified", Family), Genus = if_else(Genus ==
        "", "unclassified", Genus), Genus = gsub(" ", "_", Genus),
        Species = if_else(Species == "", "unclassified", Species),
        Species = gsub(" ", "_", Species))
write.csv(taxumap_taxonomy, "./Results/taxonomy.csv", row.names = FALSE)

TaxUMAP Analysis

import sys

print(sys.path)

# USER MUST SET THEIR PATH TO THEIR LOCALLY INSTALLED TAXUMAP LOCATION
## ['', '/usr/bin', '/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.8/lib/python38.zip', '/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.8/lib/python3.8', '/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.8/lib/python3.8/lib-dynload', '/Users/nick/Library/Python/3.8/lib/python/site-packages', '/opt/anaconda3/bin/taxumap', '/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.8/lib/python3.8/site-packages', '/Library/Frameworks/R.framework/Versions/4.2/Resources/library/reticulate/python']
sys.path.insert(0, '/Users/nick/Documents/taxumap/taxumap/')

from taxumap.taxumap_base import Taxumap

# From file
tu = Taxumap(taxonomy='./Results/taxonomy.csv', microbiota_data='./Results/microbiota_table.csv',random_state=456)

# Transform the data (an inplace function)
## Phylum Class
## not monophyletic
## Class Order
## not monophyletic
## Order Family
## not monophyletic
## Family Genus
## not monophyletic
## post validate inputs main                Kingdom  ...                          Species
## ASV                     ...                                 
## taxID817      Bacteria  ...             Bacteroides_fragilis
## taxID818      Bacteria  ...     Bacteroides_thetaiotaomicron
## taxID820      Bacteria  ...            Bacteroides_uniformis
## taxID821      Bacteria  ...             Phocaeicola_vulgatus
## taxID823      Bacteria  ...       Parabacteroides_distasonis
## ...                ...  ...                              ...
## taxID28447    Bacteria  ...        Clavibacter_michiganensis
## taxID33968    Bacteria  ...  Leuconostoc_pseudomesenteroides
## taxID709323   Bacteria  ...         Fructobacillus_tropaeoli
## taxID1070421  Bacteria  ...            Periweissella_fabalis
## taxID2749962  Bacteria  ...         Lactococcus_paracarnosus
## 
## [672 rows x 7 columns]
## 
## /opt/anaconda3/bin/taxumap/taxumap/dataloading.py:86: UserWarning: Reading taxonomy table. Assuming columns are ordered by phylogeny with in descending order of hierarchy:
##                  e.g. Kingdom, Phylum, ... , Genus, Species, etc.
##                  Additionally, the OTU or ASV column must be labeled as 'OTU' or 'ASV' unless otherwise specified
##   warnings.warn(
tu.transform_self(neigh=55)

# Raw embedding dataframe
## Taxumap(agg_levels = ['Phylum', 'Family'], weights = [1, 1])
## 
## /opt/anaconda3/bin/taxumap/taxumap/taxumap_base.py:91: UserWarning: Setting min_dist to 0.05/sum(weights)
##   warnings.warn("Setting min_dist to 0.05/sum(weights)")
## /opt/anaconda3/bin/taxumap/taxumap/taxumap_base.py:102: UserWarning: Setting epochs to 5000
##   warnings.warn("Setting epochs to %d" % epochs)
## /opt/anaconda3/bin/taxumap/taxumap/tools.py:47: UserWarning: aggregating on Phylum
##   warnings.warn("aggregating on %s" % agg_level)
## /opt/anaconda3/bin/taxumap/taxumap/tools.py:47: UserWarning: aggregating on Family
##   warnings.warn("aggregating on %s" % agg_level)
embedding_df = tu.df_embedding

Figure 1: Shotgun Metagenomics, Bile Acid Genes, taxUMAP and Diversity

#### Alpha Diversity ####
# Alpha diversity matrix: Inverse Simpson
alpha_invsim <- vegan::diversity(t_metaphlan_mat, index = "invsimpson") %>%
  as.data.frame() %>% 
  rownames_to_column(var = "sampleID") %>% 
  dplyr::rename("InvSimpson" = ".")

# Alpha Diversity matrix: Shannon
alpha_shannon <- vegan::diversity(t_metaphlan_mat, index = "shannon") %>%
  as.data.frame() %>% 
  rownames_to_column(var = "sampleID") %>% 
  dplyr::rename("Shannon" = ".")

# Alpha Diversity matrix: Observed ASVs
alpha_richness <- vegan::specnumber(t_metaphlan_mat) %>%
  as.data.frame() %>% 
  rownames_to_column(var = "sampleID") %>% 
  dplyr::rename("Richness" = ".")

#### taxUMAP #####
# Find most abundant taxa per sample and plot just that
top_tax <- t_metaphlan %>%
  group_by(sampleID) %>%
  slice_max(pctseqs, n = 1) %>%
  left_join(tax_lookup) %>% 
  mutate(across(everything(), ~ifelse(.=="", NA, as.character(.)))) %>% 
  replace_na(list(Species="unclassified",
                  Genus="unclassified",
                  Family="unclassified",
                  Order="unclassified",
                  Class="unclassified",
                  Phylum="unclassified")) %>%
  mutate(Genus=ifelse(Genus=="unclassified",
                      paste(Family,Genus,sep="\n"),
                      as.character(Genus)),
         pctseqs = as.numeric(pctseqs))


metaphlan_df2 <- t_metaphlan %>% 
  mutate(db = factor(db, levels = c("Liver Transplant", "Healthy Donor"))) %>% 
  left_join(tax_lookup) %>% 
  drop_na(taxid) %>% 
  arrange(Kingdom, Phylum, Class, Order, Family, Genus) %>%
  mutate(Genus = paste0(Phylum,"-",Order,"-", Family, "-",Genus)) %>% 
  left_join(alpha_shannon) %>%
  group_by(sampleID) %>% 
  arrange(Genus) %>% 
  mutate(cum.pct = cumsum(pctseqs), 
         y.text = (cum.pct + c(0, cum.pct[-length(cum.pct)]))/2) %>% 
  ungroup() %>%
  dplyr::select(-cum.pct)

metaphlan_df_sumry <- 
  metaphlan_df2 %>% 
  group_by(sampleID) %>% 
  dplyr::slice(1) %>% 
  ungroup() %>% 
  mutate(healthy_min_shannon = min(Shannon[db == "Healthy Donor"]),
         diversity_group = case_when(
           db == "Healthy Donor" ~ "Healthy Donor",
           Shannon >= healthy_min_shannon ~ "High Diversity",
           TRUE ~ "TBD"
         ),
         lt_med_shannon = median(Shannon[diversity_group == "TBD"], na.rm = TRUE),
         diversity_group = case_when(
           diversity_group %in% c("Healthy Donor", "High Diversity") ~ diversity_group,
           Shannon >= lt_med_shannon ~ "Medium Diversity",
           TRUE ~ "Low Diversity"
         ),
         diversity_group = factor(
           diversity_group,
           levels = c(
             "Low Diversity",
             "Medium Diversity",
             "High Diversity",
             "Healthy Donor"
           )
         ),
         diversity_group_abv = gsub(pattern = " Diversity",
                                    replacement = "",
                                    x = diversity_group),
         diversity_group_abv = factor(diversity_group_abv,
                                      levels = c("Low", "Medium", "High", "Healthy Donor"))) %>% 
  arrange(Genus)


tax_umap_mat_plot <- py$embedding_df %>% 
  as.data.frame() %>% 
  rownames_to_column(var = "sampleID") %>% 
  left_join(t_metaphlan %>% 
              distinct(sampleID, db)) %>%
  left_join(top_tax) %>% 
  ggplot(aes(x = taxumap1, y = taxumap2, color = Genus, shape = db, size = pctseqs*100, alpha = db))+
  geom_point(fill = "black")+
  theme_bw() +
  theme(panel.grid = eb(),
        axis.title = et(color = "black", size = 14),
        axis.text = et(color = "black", size = 12),
        axis.line = el(color = "black"),
        # legend.title = et(color = "black", size = 14),
        # legend.text = et(color = "black", size = 12),
        legend.position = "right"
        ) + 
  xlab("taxUMAP1") +
  ylab("taxUMAP2") +
  guides(
    shape = guide_legend(
      title = "Cohort",
      override.aes = list(size = 4),
      order = 1,
      ncol = 1,
      title.position = "top"
    ),
    size = guide_legend(
      title = "Relative Abundance (%)",
      order = 2,
      ncol = 1,
      title.position = "top"
    ),
    color = "none",
    fill = "none",
    alpha = "none"
  )+
  scale_color_manual(values = metaphlan_pal)+
  scale_shape_manual(values = c(24, 16))+
  scale_alpha_manual(values = c(1, 0.45))

tax_umap_mat_plot

ggsave(plot = tax_umap_mat_plot, 
            filename = "./Results/Figure_1B.pdf",
            height = 8, width = 8)

#### Alpha Diversity Stats #####
# Obtain stats for alpha diversity
diversity_comps <- list(
  c("Healthy Donor", "High"),
  c("High", "Medium"),
  c("High", "Low"),
  c("Medium", "Low")
  )

alpha_stats <- alpha_invsim %>% 
  left_join(alpha_shannon) %>% 
  left_join(alpha_richness) %>% 
  inner_join(metaphlan_df2 %>%
               left_join(metaphlan_df_sumry %>% select(sampleID, db, diversity_group_abv)) %>% 
               distinct(sampleID, db, diversity_group_abv)
             ) %>% 
  pivot_longer(!c(sampleID, db, diversity_group_abv),names_to = "diversity_metric", values_to = "value") %>% 
  group_by(diversity_metric) %>% 
  rstatix::wilcox_test(value~diversity_group_abv,
                        comparisons = diversity_comps,
                       p.adjust.method = "BH",
                       alternative= "two.sided"
                       ) %>% 
  ungroup()

# Create dataframe for all phylogentic levels of interest
phylo_rel_abd <- t_metaphlan %>% 
  left_join(tax_lookup) %>% 
  inner_join(metaphlan_df2 %>%
               left_join(metaphlan_df_sumry %>% select(sampleID, db, diversity_group_abv)) %>% 
               distinct(sampleID, db, diversity_group_abv)
             ) %>% 
  mutate(Species = paste(Kingdom, Phylum, Class, Order, Family, Genus, Species, sep = "|")) %>% 
  filter(grepl(pattern = "Enterococcus|Enterobacterales|Bacteroidetes|Lachnospiraceae|Oscillospiraceae", x = Species)) %>% 
  mutate(organism = case_when(grepl(pattern = "Enterococcus", x = Species) ~ "Enterococcus",
                              grepl(pattern = "Enterobacterales", x = Species) ~ "Enterobacterales",
                              grepl(pattern = "Bacteroidetes", x = Species) ~ "Bacteroidetes",
                              grepl(pattern = "Lachnospiraceae", x = Species) ~ "Lachnospiraceae",
                              grepl(pattern = "Oscillospiraceae", x = Species) ~ "Oscillospiraceae")) %>% 
  select(sampleID, db, diversity_group_abv, organism, pctseqs) %>% 
  group_by(sampleID, db, diversity_group_abv, organism) %>% 
  summarise(pctseqs = sum(pctseqs)) %>% 
  ungroup() %>% 
  pivot_wider(names_from = organism, values_from = pctseqs, values_fill = 0) %>% 
  pivot_longer(!c(sampleID, db, diversity_group_abv), names_to = "organism", values_to = "pctseqs")

# Obtain stats for all phylogentic levels of interest
rel_abd_alpha_stats <- phylo_rel_abd %>%  
  group_by(organism) %>% 
  rstatix::wilcox_test(pctseqs~diversity_group_abv) %>% 
  bind_rows(alpha_stats) %>% 
  rstatix::adjust_pvalue(method = "BH") %>% 
  mutate(p.adj = ifelse(p.adj < 0.001, 0.001, round(p.adj, 3)))

write.csv(rel_abd_alpha_stats, "./Results/Figure_1_Statistics.csv", row.names = FALSE)

symnum.args <- list(cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, Inf), symbols = c("****", "***", "**", "*", "ns"))

diversity_group_colors <- c("#3A001E", "#8A0246", "#C20463", "#1A49BE")

# #### Plot Inverse Simpson ####
# 
# 
# set.seed(456)
# gg_alpha_invsim <- alpha_invsim %>% 
#   inner_join(t_metaphlan %>% 
#                distinct(sampleID, db) %>% 
#                select(sampleID, db)
#              ) %>% 
#   inner_join(metaphlan_df %>%
#                left_join(metaphlan_df_sumry %>% select(sampleID, db, diversity_group, diversity_group_abv)) %>% 
#                distinct(sampleID, db, diversity_group, diversity_group_abv)
#              ) %>% 
#   mutate(db = factor(db, levels = c("Liver Transplant", "Healthy Donor"))) %>% 
#   ggplot(., aes(x = diversity_group_abv, 
#                 y = InvSimpson, 
#                 color = diversity_group_abv, 
#                 fill = db)) +
#   geom_boxplot(outlier.colour = NA,
#                alpha = 0.35) +
#   geom_jitter(width = 0.2, size = 2.5, alpha = 0.65)+
#   stat_compare_means(comparisons = diversity_comps,
#                      tip.length = 0.01,
#                      step.increase = 0.075,
#                      symnum.args = symnum.args,
#                      method.args = list(alternative = "two.sided",
#                                         exact = FALSE)
#                      ) +
#   theme_bw() +
#   theme(
#     panel.grid = eb(),
#     axis.title.y = et(size = 14, color = "black"),
#     axis.title.x = eb(),
#     axis.text = et(size = 12, color = "black"),
#     plot.margin = margin(t = 5,  # Top margin
#                          r = 5,  # Right margin
#                          b = 5,  # Bottom margin
#                          l = 5), # Left margin
#     panel.border = eb(),
#     axis.line = el(color = 'black')
#   ) +
#   ylab("Alpha Diversity\n(Inverse Simpson)") +
#   scale_fill_manual(values = rev(pirate_colors)) +
#   scale_color_manual(values = diversity_group_colors) +
#   guides(fill = guide_legend("Cohort"),
#          color = guide_legend("Diversity Group",
#                               override.aes = aes(label = "")))+
#   scale_y_continuous(breaks = seq(0,45,5),
#                      expand = expansion(mult = c(0.01, 0.1))) +
#   coord_cartesian(xlim = c(1.1,3.9))
# 
#   
# gg_alpha_invsim
# 
# pdf(file = "./Results/Figure_1C.pdf", height = 6, width = 7)
# gg_alpha_invsim
# invisible(dev.off())
# 
# #### Plot Shannon Diversity ####
# set.seed(456)
# gg_alpha_shannon <- alpha_shannon %>% 
#   inner_join(t_metaphlan %>% 
#                distinct(sampleID, db) %>% 
#                select(sampleID, db)
#              ) %>% 
#   inner_join(metaphlan_df %>%
#                left_join(metaphlan_df_sumry %>% select(sampleID, db, diversity_group, diversity_group_abv)) %>% 
#                distinct(sampleID, db, diversity_group, diversity_group_abv)
#              ) %>% 
#   mutate(db = factor(db, levels = c("Liver Transplant", "Healthy Donor"))) %>% 
#   ggplot(., aes(x = diversity_group_abv, 
#                 y = Shannon, 
#                 color = diversity_group_abv, 
#                 fill = db)) +
#   geom_boxplot(outlier.colour = NA,
#                alpha = 0.35) +
#   geom_jitter(width = 0.2, size = 2.5, alpha = 0.65)+
#   stat_compare_means(comparisons = diversity_comps,
#                      tip.length = 0.01,
#                      step.increase = 0.075,
#                      symnum.args = symnum.args,
#                      method.args = list(alternative = "two.sided",
#                                         exact = FALSE)
#                      ) + 
#   theme_bw() +
#   theme(
#     panel.grid = eb(),
#     axis.title.y = et(size = 14, color = "black"),
#     axis.title.x = eb(),
#     axis.text = et(size = 12, color = "black"),
#     plot.margin = margin(t = 5,  # Top margin
#                          r = 5,  # Right margin
#                          b = 5,  # Bottom margin
#                          l = 5), # Left margin
#     panel.border = eb(),
#     axis.line = el(color = 'black')
#   ) +
#   ylab("Alpha Diversity\n(Shannon)") +
#   scale_fill_manual(values = rev(pirate_colors)) +
#   scale_color_manual(values = diversity_group_colors) +
#   guides(fill = guide_legend("Cohort"),
#          color = guide_legend("Diversity Group",
#                               override.aes = aes(label = "")))+
#   scale_y_continuous(breaks = seq(0,6.5,1),
#                      expand = expansion(mult = c(0.01, 0.1))) +
#   coord_cartesian(xlim = c(1.1,3.9))
#   
#   
# gg_alpha_shannon
# 
# pdf(file = "./Results/Figure_1D.pdf", height = 6, width = 7)
# gg_alpha_shannon
# invisible(dev.off())
# 
# #### Plot Species Richness ####
# set.seed(456)
# gg_alpha_richness <- alpha_richness %>% 
#   inner_join(t_metaphlan %>% 
#                distinct(sampleID, db) %>% 
#                select(sampleID, db)
#              ) %>% 
#   inner_join(metaphlan_df %>%
#                left_join(metaphlan_df_sumry %>% select(sampleID, db, diversity_group, diversity_group_abv)) %>% 
#                distinct(sampleID, db, diversity_group, diversity_group_abv)
#              ) %>% 
#   mutate(db = factor(db, levels = c("Liver Transplant", "Healthy Donor"))) %>% 
#   ggplot(., aes(x = diversity_group_abv, 
#                 y = Richness, 
#                 color = diversity_group_abv, 
#                 fill = db)) +
#   geom_boxplot(outlier.colour = NA,
#                alpha = 0.35) +
#   geom_jitter(width = 0.2, size = 2.5, alpha = 0.65)+
#   stat_compare_means(comparisons = diversity_comps,
#                      tip.length = 0.01,
#                      label.y = c(145, 160, 150, 110),
#                      symnum.args = symnum.args,
#                      method.args = list(alternative = "two.sided",
#                                         exact = FALSE)
#                      ) +
#   theme_bw() +
#   theme(
#     panel.grid = eb(),
#     axis.title.y = et(size = 14, color = "black"),
#     axis.title.x = eb(),
#     axis.text = et(size = 12, color = "black"),
#     plot.margin = margin(t = 5,  # Top margin
#                          r = 5,  # Right margin
#                          b = 5,  # Bottom margin
#                          l = 5), # Left margin
#     panel.border = eb(),
#     axis.line = el(color = 'black')
#   ) +
#   ylab("Alpha Diversity\n(Richness)") +
#   scale_fill_manual(values = rev(pirate_colors)) +
#   scale_color_manual(values = diversity_group_colors) +
#   guides(fill = guide_legend("Cohort"),
#          color = guide_legend("Diversity Group",
#                               override.aes = aes(label = "")))+
#   scale_y_continuous(breaks = seq(0,180,50),
#                      expand = expansion(mult = c(0.01, 0.035))) +
#   coord_cartesian(xlim = c(1.1,3.9))
#   
#   
# gg_alpha_richness
# 
# pdf(file = "./Results/Figure_1E.pdf", height = 6, width = 7)
# gg_alpha_richness
# invisible(dev.off())

#### Plot Lachnospiraceae ####
set.seed(456)
gg_lach_rel_abd <- phylo_rel_abd %>% 
  filter(organism == "Lachnospiraceae") %>% 
  group_by(organism) %>% 
  mutate(db = factor(db, levels = c("Liver Transplant", "Healthy Donor"))) %>% 
  ggplot(., aes(x = diversity_group_abv, 
                y = pctseqs, 
                color = diversity_group_abv, 
                fill = db)) +
  geom_boxplot(outlier.colour = NA,
               alpha = 0.35) +
  geom_jitter(width = 0.2, size = 2.5, alpha = 0.65)+
  stat_compare_means(comparisons = diversity_comps,
                     tip.length = 0.01,
                     symnum.args = symnum.args,
                     method.args = list(alternative = "two.sided",
                                        exact = FALSE),
                     label.y = c(0.65, 0.71, 0.76, 0.82)
                     ) +
  theme_bw() +
  theme(
    panel.grid = eb(),
    axis.title.y = et(size = 14, color = "black"),
    axis.title.x = eb(),
    axis.text = et(size = 12, color = "black"),
    plot.margin = margin(t = 5,  # Top margin
                         r = 5,  # Right margin
                         b = 5,  # Bottom margin
                         l = 5), # Left margin
    panel.border = eb(),
    axis.line = el(color = 'black')
  ) +
  ylab(~atop(paste(italic("Lachnospiraceae")), paste("MetaPhlAn4 Relative Abundance"))) +
  scale_fill_manual(values = rev(pirate_colors)) +
  scale_color_manual(values = diversity_group_colors) +
  guides(fill = guide_legend("Cohort"),
         color = guide_legend("Diversity Group",
                              override.aes = aes(label = "")))+
  scale_y_continuous(breaks = seq(0,1,0.1),
                     limits = c(-0.01,1.1),
                     expand = expansion(mult = c(0.01, 0.1)),
                     labels = scales::percent_format(accuracy = 1)) +
  coord_cartesian(xlim = c(1.1,3.9))
  
  
# gg_lach_rel_abd

# pdf(file = "./Results/Figure_1F.pdf", height = 6, width = 7)
# gg_lach_rel_abd
# invisible(dev.off())

#### Plot Bacteroidetes ####
set.seed(456)
gg_bact_rel_abd <- phylo_rel_abd %>% 
  filter(organism == "Bacteroidetes") %>% 
  group_by(organism) %>% 
  mutate(db = factor(db, levels = c("Liver Transplant", "Healthy Donor"))) %>% 
  ggplot(., aes(x = diversity_group_abv, 
                y = pctseqs, 
                color = diversity_group_abv, 
                fill = db)) +
  geom_boxplot(outlier.colour = NA,
               alpha = 0.35) +
  geom_jitter(width = 0.2, size = 2.5, alpha = 0.65)+
  stat_compare_means(comparisons = diversity_comps,
                     tip.length = 0.01,
                     symnum.args = symnum.args,
                     method.args = list(alternative = "two.sided",
                                        exact = FALSE),
                     label.y = c(0.65, 0.98, 0.92, 1.02)
                     ) + 
  theme_bw() +
  theme(
    panel.grid = eb(),
    axis.title.y = et(size = 14, color = "black"),
    axis.title.x = eb(),
    axis.text = et(size = 12, color = "black"),
    plot.margin = margin(t = 5,  # Top margin
                         r = 5,  # Right margin
                         b = 5,  # Bottom margin
                         l = 5), # Left margin
    panel.border = eb(),
    axis.line = el(color = 'black')
  ) +
  ylab(~atop(paste(italic("Bacteroidetes")), paste("MetaPhlAn4 Relative Abundance"))) +
  scale_fill_manual(values = rev(pirate_colors)) +
  scale_color_manual(values = diversity_group_colors) +
  guides(fill = guide_legend("Cohort"),
         color = guide_legend("Diversity Group",
                              override.aes = aes(label = "")))+
  scale_y_continuous(breaks = seq(0,1,0.1),
                     limits = c(-0.01,1.1),
                     expand = expansion(mult = c(0.01, 0.1)),
                     labels = scales::percent_format(accuracy = 1)) +
  coord_cartesian(xlim = c(1.1,3.9))
  
  
# gg_bact_rel_abd

# pdf(file = "./Results/Figure_1I.pdf", height = 6, width = 7)
# gg_bact_rel_abd
# invisible(dev.off())

#### Plot Oscillospiraceae ####
set.seed(456)
gg_oscl_rel_abd <- phylo_rel_abd %>% 
  filter(organism == "Oscillospiraceae") %>% 
  group_by(organism) %>% 
  mutate(db = factor(db, levels = c("Liver Transplant", "Healthy Donor"))) %>% 
  ggplot(., aes(x = diversity_group_abv, 
                y = pctseqs, 
                color = diversity_group_abv, 
                fill = db)) +
  geom_boxplot(outlier.colour = NA,
               alpha = 0.35) +
  geom_jitter(width = 0.2, size = 2.5, alpha = 0.65)+
  stat_compare_means(comparisons = diversity_comps,
                     tip.length = 0.01,
                     symnum.args = symnum.args,
                     method.args = list(alternative = "two.sided",
                                        exact = FALSE),
                     label.y = c(0.33, 0.52, 0.59, 0.66)
                     ) +
  theme_bw() +
  theme(
    panel.grid = eb(),
    axis.title.y = et(size = 14, color = "black"),
    axis.title.x = eb(),
    axis.text = et(size = 12, color = "black"),
    plot.margin = margin(t = 5,  # Top margin
                         r = 5,  # Right margin
                         b = 5,  # Bottom margin
                         l = 5), # Left margin
    panel.border = eb(),
    axis.line = el(color = 'black')
  ) +
  ylab(~atop(paste(italic("Oscillospiraceae")), paste("MetaPhlAn4 Relative Abundance"))) +
   scale_fill_manual(values = rev(pirate_colors)) +
  scale_color_manual(values = diversity_group_colors) +
  guides(fill = guide_legend("Cohort"),
         color = guide_legend("Diversity Group",
                              override.aes = aes(label = "")))+
  scale_y_continuous(breaks = seq(0,1,0.1),
                     limits = c(-0.01,1.1),
                     expand = expansion(mult = c(0.01, 0.1)),
                     labels = scales::percent_format(accuracy = 1)) +
  coord_cartesian(xlim = c(1.1,3.9))
  
  
# gg_oscl_rel_abd

# pdf(file = "./Results/Figure_1J.pdf", height = 6, width = 7)
# gg_oscl_rel_abd
# invisible(dev.off())

#### Plot Enterococcus ####
set.seed(456)
gg_ecoc_rel_abd <- phylo_rel_abd %>% 
  filter(organism == "Enterococcus") %>% 
  group_by(organism) %>% 
  mutate(db = factor(db, levels = c("Liver Transplant", "Healthy Donor"))) %>% 
  ggplot(., aes(x = diversity_group_abv, 
                y = pctseqs, 
                color = diversity_group_abv, 
                fill = db)) +
  geom_boxplot(outlier.colour = NA,
               alpha = 0.35) +
  geom_jitter(width = 0.2, size = 2.5, alpha = 0.65)+
  stat_compare_means(comparisons = diversity_comps,
                     tip.length = 0.01,
                     symnum.args = symnum.args,
                     method.args = list(alternative = "two.sided",
                                        exact = FALSE),
                    
                     label.y = c(0.20, 0.75, 0.985, 1.05)
                     ) +
  theme_bw() +
  theme(
    panel.grid = eb(),
    axis.title.y = et(size = 14, color = "black"),
    axis.title.x = eb(),
    axis.text = et(size = 12, color = "black"),
    plot.margin = margin(t = 5,  # Top margin
                         r = 5,  # Right margin
                         b = 5,  # Bottom margin
                         l = 5), # Left margin
    panel.border = eb(),
    axis.line = el(color = 'black')
  ) +
  ylab(~atop(paste(italic("Enterococcus")), paste("MetaPhlAn4 Relative Abundance"))) +
  scale_fill_manual(values = rev(pirate_colors)) +
  scale_color_manual(values = diversity_group_colors) +
  guides(fill = guide_legend("Cohort"),
         color = guide_legend("Diversity Group",
                              override.aes = aes(label = "")))+
  scale_y_continuous(breaks = seq(0,1,0.1),
                     limits = c(-0.01,1.1),
                     expand = expansion(mult = c(0.01, 0.1)),
                     labels = scales::percent_format(accuracy = 1)) +
  coord_cartesian(xlim = c(1.1,3.9))
  
  
# gg_ecoc_rel_abd

# pdf(file = "./Results/Figure_1G.pdf", height = 6, width = 7)
# gg_ecoc_rel_abd
# invisible(dev.off())

#### Plot Enterobacterales #####
set.seed(456)
gg_ebac_rel_abd <- phylo_rel_abd %>% 
  filter(organism == "Enterobacterales") %>% 
  group_by(organism) %>% 
  mutate(db = factor(db, levels = c("Liver Transplant", "Healthy Donor"))) %>% 
  ggplot(., aes(x = diversity_group_abv, 
                y = pctseqs, 
                color = diversity_group_abv, 
                fill = db)) +
  geom_boxplot(outlier.colour = NA,
               alpha = 0.35) +
  geom_jitter(width = 0.2, size = 2.5, alpha = 0.65)+
  stat_compare_means(comparisons = diversity_comps,
                     tip.length = 0.01,
                     symnum.args = symnum.args,
                     method.args = list(alternative = "two.sided",
                                        exact = FALSE),
                     label.y = c(0.20, 0.75, 0.985, 1.05)
                     ) +
  theme_bw() +
  theme(
    panel.grid = eb(),
    axis.title.y = et(size = 14, color = "black"),
    axis.title.x = eb(),
    axis.text = et(size = 12, color = "black"),
    plot.margin = margin(t = 5,  # Top margin
                         r = 5,  # Right margin
                         b = 5,  # Bottom margin
                         l = 5), # Left margin
    panel.border = eb(),
    axis.line = el(color = 'black')
  ) +
  ylab(~atop(paste(italic("Enterobacterales")), paste("MetaPhlAn4 Relative Abundance"))) +
  scale_fill_manual(values = rev(pirate_colors)) +
  scale_color_manual(values = diversity_group_colors) +
  guides(fill = guide_legend("Cohort"),
         color = guide_legend("Diversity Group",
                              override.aes = aes(label = "")))+
  scale_y_continuous(breaks = seq(0,1,0.1),
                     limits = c(-0.01,1.1),
                     expand = expansion(mult = c(0.01, 0.1)),
                     labels = scales::percent_format(accuracy = 1)) +
  coord_cartesian(xlim = c(1.1,3.9))
  
  
# gg_ebac_rel_abd

# pdf(file = "./Results/Figure_1H.pdf", height = 6, width = 7)
# gg_ebac_rel_abd
# invisible(dev.off())

#### Plot Metaphlan Relative Abundance ####
# Figure 1A-MetaPhlAn4 Taxonomy
# Load legend
tax_legend <- magick::image_read_pdf("./Data/legend.v2.pdf")

gg_tax_legend <- cowplot::ggdraw() + cowplot::draw_image(tax_legend)

metaphlan_pal2 <- getRdpPal(metaphlan_df2)

gg_metaphlan <- metaphlan_df2 %>%
  left_join(metaphlan_df_sumry %>% select(sampleID, diversity_group)) %>% 
  ungroup() %>% 
  mutate(Genus = factor(Genus, levels = unique(Genus))) %>% 
  group_by(sampleID) %>% 
  arrange(Genus) %>% 
  ggplot(aes(x=reorder(sampleID, Shannon),y=pctseqs)) +
  geom_bar(stat="identity",aes(fill=Genus), width = 0.9) +
  scale_fill_manual(values = metaphlan_pal2) +
  theme_bw() +
  theme(legend.position = "none",
        axis.text.x=eb(),
        axis.ticks.x=eb(),
        strip.text.x= et(angle=0,size=14),
        strip.background = eb(),
        axis.title.y = et(color = "black", size = 14),
        axis.text.y = et(color = "black", size = 12),
        panel.spacing = unit(0.5, "lines"),
        plot.margin = margin(t = 5,
                             r = 5, 
                             b = 0, 
                             l = 5)) +
  facet_grid(. ~diversity_group, scales = "free", space = "free")+
  scale_y_continuous(expand = expansion(mult = c(0.005,0.005)),
                     labels = scales::percent_format(accuracy = 1)) +
  ylab("MetaPhlAn4 Relative Abundance") +
  xlab("")


# Color facets
gg_metaphlan_grob <- ggplot_gtable(ggplot_build(gg_metaphlan))
strip_both <- which(grepl('strip-', gg_metaphlan_grob$layout$name))
fills <- diversity_group_colors
k <- 1
for (i in strip_both) {
  l <- which(grepl('titleGrob', gg_metaphlan_grob$grobs[[i]]$grobs[[1]]$childrenOrder))
  gg_metaphlan_grob$grobs[[i]]$grobs[[1]]$children[[l]]$children[[1]]$gp$col <- fills[k]
  k <- k+1
}

gg_shannon <- metaphlan_df_sumry %>% 
  ggplot(aes(x=reorder(sampleID, Shannon), y = Shannon)) +
  geom_bar(stat="identity",aes(fill=diversity_group_abv), width = 0.9) +
  theme_bw() +
  theme(legend.position = "none",
        axis.text.x = eb(),
        axis.title.x = eb(),
        axis.ticks.x = eb(),
        strip.text = eb(),
        strip.background = er(fill = "white"),
        axis.title.y = et(color = "black", size = 14),
        axis.text.y = et(color = "black", size = 12),
        panel.spacing = unit(0.5, "lines"),
        plot.margin = margin(t = 0,
                             r = 5, 
                             b = 0, 
                             l = 5),
        panel.grid = eb()) +
  scale_fill_manual(values = diversity_group_colors) +
  facet_grid(. ~diversity_group, scales = "free", space = "free")

gg_metaphlan_shannon <-
plot_grid(
  gg_metaphlan_grob,
  gg_shannon,
  axis = "lb",
  align = "hv",
  nrow = 2,
  rel_heights = c(1, 0.15)
)

pdf(file = "./Results/Figure_1A.pdf", width = 12.25, height = 8)
gg_metaphlan_shannon 
invisible(dev.off())

#### Combine all into a single figure 1 Start ####
alpha_org_plot <- plot_grid(
  # gg_alpha_invsim + theme(legend.position = "none", 
  #                         axis.text.x = eb(), 
  #                         axis.title.x = eb(),
  #                         plot.margin = unit(c(0.1, 0, 0.15, 0), "cm")),
  # gg_alpha_shannon + theme(legend.position = "none", 
  #                          axis.text.x = eb(), 
  #                          axis.title.x = eb(),
  #                          plot.margin = unit(c(0.1, 0, 0.15, 0), "cm")),
  # gg_alpha_richness+ theme(legend.position = "none", 
  #                          axis.text.x = eb(),
  #                          axis.title.x = eb(),
  #                          plot.margin = unit(c(0.1, 0, 0.15, 0), "cm")),
  gg_lach_rel_abd + theme(legend.position = "none",
                          axis.text.x = et(angle = 45, hjust = 0.85, vjust = 0.85),
                          axis.title.x = eb(),
                          plot.margin = unit(c(0.05, 0, 0, 0), "cm")),
  gg_bact_rel_abd + theme(legend.position = "none", 
                          axis.text.x = et(angle = 45, hjust = 0.85, vjust = 0.85), 
                          plot.margin = unit(c(0.05, 0, 0, 0), "cm")),
  gg_oscl_rel_abd + theme(axis.text.x = et(angle = 45, hjust = 0.85, vjust = 0.85), 
                          plot.margin = unit(c(0.05, 0, 0, 0), "cm"),
                          legend.position = "right"),
  gg_ecoc_rel_abd + theme(legend.position = "none", 
                          axis.text.x = et(angle = 45, hjust = 0.85, vjust = 0.85), 
                          plot.margin = unit(c(0.05, 0, 0, 0), "cm")),
  gg_ebac_rel_abd + theme(legend.position = "none", 
                          axis.text.x = et(angle = 45, hjust = 0.85, vjust = 0.85), 
                          plot.margin = unit(c(0.05, 0, 0, 0), "cm")),
  tax_umap_mat_plot + 
    theme(plot.margin = unit(c(0.05, 0, 0, 0), "cm")),
  nrow = 2,
  axis = "lb",
  align = "v",
  rel_widths = c(1, 1, 1.3),
  rel_heights = c(1,1)
)

# This is useful to figure out the general layout of the plots
gs <- lapply(1:3, function(ii)
  grobTree(rectGrob(gp=gpar(fill=ii, alpha=0.5)), textGrob(ii)))

# grid.arrange(grobs=gs, ncol=4,
#                top="top label", bottom="bottom\nlabel",
#                left="left label", right="right label")

lay <- rbind(c(1,1,1,1,1,NA),
             c(1,1,1,1,1,2),
             c(1,1,1,1,1,NA),
             c(3,3,3,3,3,3),
             c(3,3,3,3,3,3),
             c(3,3,3,3,3,3),
             c(3,3,3,3,3,3))

# grid.arrange(grobs = gs, layout_matrix = lay)

pdf(file = "./Results/Figure_1.pdf", height = 16, width = 20)

grid.arrange(
  gg_metaphlan_shannon,                        # 1
  gg_tax_legend,                               # 2
  # tax_umap_mat_plot + 
  #   theme(
  #     legend.text = et(size = 10),
  #     legend.title = et(size = 12),
  #     plot.margin = margin(
  #       t = 5,  # Top margin
  #       r = 0,  # Right margin
  #       b = 75, # Bottom margin
  #       l = 5   # Left margin
  #     )
    # ),                                      # 2
  alpha_org_plot,                           # 3
  layout_matrix = lay
)

invisible(dev.off())

grid.arrange(
  gg_metaphlan_shannon,                        # 1
  gg_tax_legend,                               # 2
  # tax_umap_mat_plot + 
  #   theme(
  #     legend.text = et(size = 10),
  #     legend.title = et(size = 12),
  #     plot.margin = margin(
  #       t = 5,  # Top margin
  #       r = 0,  # Right margin
  #       b = 75, # Bottom margin
  #       l = 5   # Left margin
  #     )
    # ),                                      # 2
  alpha_org_plot,                           # 3
  layout_matrix = lay
)

#### Combine all into a single figure 1 End ####

Figure 4: Pathobiont Abundance and Bile Acid Genes

gg_metaphlan_pathos <- metaphlan_df2 %>%
  left_join(metaphlan_df_sumry %>% select(sampleID, diversity_group)) %>% 
  mutate(Species = paste(Kingdom, Phylum, Class, Order, Family, Genus, Species, sep = "-")) %>% 
  ungroup() %>% 
  filter(diversity_group != "Healthy Donor") %>% 
  mutate(Species = case_when(grepl(x = Species, pattern = "Enterococcus") ~ "Enterococcus",
                             grepl(x = Species, pattern = "Enterobacterales") ~ "Enterobacterales",
                             TRUE ~ "Other")) %>% 
  group_by(sampleID, diversity_group, Shannon, Species) %>% 
  summarise(pctseqs = sum(pctseqs)) %>%
  mutate(pctseqs = ifelse(Species == "Other", 0, pctseqs)) %>% 
  ungroup() %>% 
  arrange(Species) %>% 
  mutate(Species = forcats::fct_relevel(Species)) %>%
  group_by(sampleID) %>%
  arrange(Species) %>% #distinct(sampleID)
  ggplot(aes(x=reorder(sampleID, Shannon),y=pctseqs)) +
  geom_bar(stat="identity",aes(fill=Species), width = 0.9) +
  scale_fill_manual("Pathobiont",
                    values = c("#FF0000", "#0C7A3A", "#00000000"),
                    breaks = c("Enterobacterales", "Enterococcus", "")) +
  # paletteer::scale_fill_paletteer_d(palette = "vapeplot::vaporwave"
  #                                     # "rcartocolor::Antique"
  #                                     # "MetBrewer::Renoir"
  #                                   ) +
  theme_bw() +
  theme(legend.position = "right",
        axis.text.x=eb(),
        axis.ticks.x=eb(),
        strip.text.x= et(angle=0,size=14),
        strip.background = eb(),
        axis.title.y = et(color = "black", size = 14),
        axis.text.y = et(color = "black", size = 12),
        panel.spacing = unit(0.5, "lines"),
        panel.grid.minor = eb(),
        panel.grid.major.y = eb()) +
  facet_grid(. ~diversity_group, scales = "free_x")+
  scale_y_continuous(expand = expansion(mult = c(0.005,0.005)),
                     labels = scales::percent_format(accuracy = 1)) +
  ylab("MetaPhlAn4 Relative Abundance") +
  xlab("")


# gg_metaphlan_pathos

#### Bile Acid Genes ####
ba_genes_wilcox_test <- ba_genes %>%
  filter(grepl(patientID, pattern = "^lt")) %>% 
  mutate(gene = factor(gene, levels = c("BaiA", "BaiA1", "BaiA2",
                                        "BaiB", "BaiCD", "BaiE",
                                        "BaiF", "BaiG", "BaiH", 
                                        "BaiI", "BSH",
                                        "3aHSDH", "3bHSDH",
                                        "7aHSDH", "7bHSDH",
                                        "12abHSDH",
                                        "5AR", "5BR"))) %>% 
  filter(sampleID %in% metaphlan_df2$sampleID) %>% 
  left_join(metaphlan_df_sumry %>% select(sampleID, Shannon, diversity_group)) %>% 
  group_by(gene) %>% 
  wilcox_test(tpm ~ diversity_group) 
 
ba_genes_wilcox_effect <- ba_genes %>%
  mutate(gene = factor(gene, levels = c("BaiA", "BaiA1", "BaiA2",
                                        "BaiB", "BaiCD", "BaiE",
                                        "BaiF", "BaiG", "BaiH", 
                                        "BaiI", "BSH",
                                        "3aHSDH", "3bHSDH",
                                        "7aHSDH", "7bHSDH",
                                        "12abHSDH",
                                        "5AR", "5BR"))) %>% 
  filter(sampleID %in% metaphlan_df2$sampleID) %>% 
  left_join(metaphlan_df_sumry %>% select(sampleID, Shannon, diversity_group)) %>% 
  group_by(gene) %>% 
  wilcox_effsize(tpm ~ diversity_group)
 
gg_ba_genes <- ba_genes %>%
  filter(grepl(patientID, pattern = "^lt")) %>% 
  mutate(gene = factor(gene, levels = c("BaiA", "BaiA1", "BaiA2",
                                        "BaiB", "BaiCD", "BaiE",
                                        "BaiF", "BaiG", "BaiH", 
                                        "BaiI", "BSH",
                                        "3aHSDH", "3bHSDH",
                                        "7aHSDH", "7bHSDH",
                                        "12abHSDH",
                                        "5AR", "5BR"))) %>% 
  filter(sampleID %in% metaphlan_df2$sampleID) %>% 
  left_join(metaphlan_df_sumry %>% select(sampleID, Shannon, diversity_group)) %>% 
  ungroup() %>% 
  ggplot(aes(x = reorder(sampleID, Shannon), y = tpm, fill = diversity_group)) +
  geom_col() +
  theme_bw() +
  theme(legend.position = "none",
        axis.text.x = eb(),
        axis.ticks.x = eb(),
        strip.text.x = eb(),
        strip.text.y = et(angle = 0, size = 14, hjust = 0),
        strip.background = eb(),
        axis.title.y = et(color = "black", size = 14),
        axis.text.y = et(color = "black", size = 12),
        panel.spacing = unit(0.5, "lines"),
        plot.margin = margin(t = 5,
                             r = 5, 
                             b = 0, 
                             l = 5)) +
  scale_fill_manual(values = diversity_group_colors) +
  facet_grid(gene~diversity_group, scales = "free_x")+
  scale_y_continuous(expand = expansion(mult = c(0.005,0.005))) +
  ylab("Gene Abundance (TPM)") +
  xlab("")
 
# gg_ba_genes

pdf(file = "./Results/Metaphlan_Bile_Acid.pdf", height = 26, width = 20, onefile = FALSE)
gg.stack(gg_metaphlan,
         gg_ba_genes,
         heights = c(1, 8))
dev.off()
## quartz_off_screen 
##                 2
 # # Save gg_metaphlan as plotly html
# htmlwidgets::saveWidget(plotly::ggplotly(metaphlan_df2 %>%
#   left_join(metaphlan_df_sumry %>% select(sampleID, diversity_group)) %>% 
#   ungroup() %>% 
#   mutate(Genus = paste(Genus, Species, sep = "-"),
#          Genus = factor(Genus, levels = unique(Genus))) %>% 
#   group_by(sampleID) %>% 
#   arrange(Genus) %>% 
#   ggplot(aes(x=reorder(sampleID, Shannon),y=pctseqs)) +
#   geom_bar(stat="identity",aes(fill=Genus), width = 0.9) +
#   scale_fill_continuous(type = "viridis") +
#   theme_bw() +
#   theme(legend.position = "none",
#         axis.text.x=eb(),
#         axis.ticks.x=eb(),
#         strip.text.x= et(angle=0,size=14),
#         strip.background = eb(),
#         axis.title.y = et(color = "black", size = 14),
#         axis.text.y = et(color = "black", size = 12),
#         panel.spacing = unit(0.5, "lines"),
#         plot.margin = margin(t = 5,
#                              r = 5, 
#                              b = 0, 
#                              l = 5)) +
#   facet_grid(. ~diversity_group, scales = "free", space = "free")+
#   scale_y_continuous(expand = expansion(mult = c(0.005,0.005)),
#                      labels = scales::percent_format(accuracy = 1)) +
#   ylab("MetaPhlAn4 Relative Abundance") +
#   xlab("")), "./Results/Metaphlan_Species.html")


# Stats
gg_bile_gene_stats <-
ggstatsplot::grouped_ggbetweenstats(
  data = ba_genes %>%
    filter(grepl(patientID, pattern = "^lt")) %>% 
    mutate(gene = factor(
      gene,
      levels = c(
        "BaiA",
        "BaiA1",
        "BaiA2",
        "BaiB",
        "BaiCD",
        "BaiE",
        "BaiF",
        "BaiG",
        "BaiH",
        "BaiI",
        "BSH",
        "3aHSDH",
        "3bHSDH",
        "7aHSDH",
        "7bHSDH",
        "12abHSDH",
        "5AR",
        "5BR"
      )
    )) %>%
    filter(sampleID %in% metaphlan_df2$sampleID) %>%
    left_join(metaphlan_df_sumry %>% select(sampleID, Shannon, diversity_group)),
  x = diversity_group,
  y = tpm,
  grouping.var = gene, 
  ggsignif.args  = list(textsize = 4, tip_length = 0.01),
  p.adjust.method  = "BH",
  type = "non-parametric",
  # pairwise.comparisons = FALSE,
  pairwise.display = "significant",
  results.subtitle = FALSE,
  ggplot.component = list(scale_fill_manual(values = diversity_group_colors),
                          scale_color_manual(values = diversity_group_colors)),
  plotgrid.args = list(nrow = 3),
  annotation.args = list(title = "Bile Acid Gene")
) #+
  # geom_signif(comparisons = list(c("Medium Diversity", "High Diversity"),
  #                                c("Medium Diversity", "Low Diversity")), 
  #             test = "wilcox.test")

pdf(file = "./Results/Bile_Acid_Stats.pdf", height = 20, width = 40, onefile = FALSE)
gg_bile_gene_stats
dev.off()
## quartz_off_screen 
##                 2
# Bile Genes Heatmap
gg_ba_genes_heatmap <- 
ba_genes %>%
  filter(grepl(patientID, pattern = "^lt")) %>% 
  mutate(gene = factor(gene, levels = c("BaiA", "BaiA1", "BaiA2",
                                        "BaiB", "BaiCD", "BaiE",
                                        "BaiF", "BaiG", "BaiH", 
                                        "BaiI", "BSH",
                                        "3aHSDH", "3bHSDH",
                                        "7aHSDH", "7bHSDH",
                                        "12abHSDH",
                                        "5AR", "5BR")),
         gene = forcats::fct_rev(gene)) %>% 
  filter(sampleID %in% metaphlan_df2$sampleID) %>% 
  left_join(metaphlan_df_sumry %>% select(sampleID, Shannon, diversity_group)) %>% 
  group_by(gene) %>% 
  mutate(tpm_zscore = scale(tpm, scale = TRUE, center = TRUE),
         tpm_log10 = log(tpm, base = 10),
         tpm_log10 = ifelse(is.infinite(tpm_log10), 0, tpm_log10)) %>%
  ggplot(aes(x = reorder(sampleID, Shannon), y = gene, fill = tpm_log10)) +
  geom_tile(stat = "identity", color = "gray") +
  theme_bw() +
  theme(
        panel.grid = eb(),
        plot.background = eb(),
        panel.border = er(colour = "black",
                          fill = NA,
                          linewidth = 0.5), 
        axis.title = et(color = "black", size = 14),
        axis.text.y = et(color = "black", size = 12),
        axis.text.x = eb(),
        axis.ticks.x = eb(),
        strip.text.x = eb(),
        strip.text.y = et(angle = 0, size = 14, hjust = 0),
        strip.background = eb(),
        axis.line = el(color = "black")
  ) +
  facet_grid(. ~diversity_group, scales = "free_x")+
  ylab("Gene\n") +
  xlab("") +
  guides(fill = guide_legend(title = "TPM",
                             reverse = TRUE)) +
  scale_fill_gradient2(low = "white",
                       mid= "cyan1",
                       high = "#0c0970",
                       midpoint = 3.5,
                      breaks = seq(1, 7, by = 1),
                      # labels = c("1", "2", "3", "4", "5", "6", "7"),
                      labels = scales::label_math(),
                      limits = c(0, 7)
                    ) +
  # scale_fill_fermenter(palette = "YlGnBu", direction = 1) +
  # scale_x_discrete(expand=c(0,0))+
  scale_y_discrete(expand=c(0,0))

gg_ba_genes_bars <- 
ba_genes %>%
  filter(grepl(patientID, pattern = "^lt")) %>% 
  mutate(gene = factor(gene, levels = c("BaiA", "BaiA1", "BaiA2",
                                        "BaiB", "BaiCD", "BaiE",
                                        "BaiF", "BaiG", "BaiH", 
                                        "BaiI", "BSH",
                                        "3aHSDH", "3bHSDH",
                                        "7aHSDH", "7bHSDH",
                                        "12abHSDH",
                                        "5AR", "5BR"))) %>% 
  filter(sampleID %in% metaphlan_df2$sampleID) %>% 
  left_join(metaphlan_df_sumry %>% select(sampleID, Shannon, diversity_group)) %>% 
  mutate(pres_abs = ifelse(tpm > 0, 1, 0)) %>% 
  group_by(sampleID, Shannon, diversity_group) %>% 
  summarise(pres_abs = sum(pres_abs)) %>% 
  # summarise(tot_pres_abs = sum(pres_abs)) %>% 
  # mutate(pct_pres = tot_pres_abs / length(unique(ba_genes$gene))) %>%
  ggplot(aes(x = reorder(sampleID, Shannon), y = pres_abs, fill = diversity_group)) +
  geom_col() +
  geom_hline(yintercept = 18, linetype = "dashed") +
  # annotate(geom = "text", x = 10, y = 18.5, label = "18 Genes Total") +
  theme_bw() +
  theme(
        panel.grid = eb(),
        plot.background = eb(),
        panel.border = er(colour = "black",
                          fill = NA,
                          linewidth = 0.5), 
        axis.title = et(color = "black", size = 14),
        axis.text.y = et(color = "black", size = 12),
        axis.text.x = eb(),
        axis.ticks.x = eb(),
        strip.text.x = eb(),
        strip.text.y = et(angle = 0, size = 14, hjust = 0),
        strip.background = eb(),
        axis.line = el(color = "black"),
        legend.position = "none"
  ) +
  facet_grid(. ~diversity_group, scales = "free_x")+
  scale_fill_manual(values = diversity_group_colors) +
  ylab("Genes Present (Total) \n") +
  xlab("") +
  guides(fill = guide_legend(title = "Diversity Group",
                             reverse = TRUE)) +
  # scale_x_discrete(expand=c(0,0)) +
  scale_y_continuous(expand = c(0,0),
                     breaks = c(seq(0,15,5), 18),
                     limits = c(0,20),
                     labels = c("0", "5", "10", "15", "18 Total Genes"))

ba_genes_stats <- ba_genes %>%
  filter(grepl(patientID, pattern = "^lt")) %>% 
  mutate(gene = factor(gene, levels = c("BaiA", "BaiA1", "BaiA2",
                                        "BaiB", "BaiCD", "BaiE",
                                        "BaiF", "BaiG", "BaiH", 
                                        "BaiI", "BSH",
                                        "3aHSDH", "3bHSDH",
                                        "7aHSDH", "7bHSDH",
                                        "12abHSDH",
                                        "5AR", "5BR"))) %>% 
  filter(sampleID %in% metaphlan_df2$sampleID) %>% 
  left_join(metaphlan_df_sumry %>% select(sampleID, Shannon, diversity_group)) %>% 
  mutate(pres_abs = ifelse(tpm > 0, 1, 0)) %>% 
  group_by(sampleID, Shannon, diversity_group) %>% 
  summarise(tot_pres_abs = sum(pres_abs)) %>% 
  mutate(pct_pres = tot_pres_abs / length(unique(ba_genes$gene))) %>% 
  ungroup() %>%
  kruskal_test(pct_pres ~ diversity_group)
  # wilcox_test(pct_pres ~ diversity_group, comparisons = list(
  #   c("High Diversity", "Medium Diversity"),
  #   c("High Diversity", "Low Diversity"),
  #   c("Medium Diversity", "Low Diversity")
  # ))

genes_stats_anno <- data.frame(diversity_group = c("Low Diversity",
                                                   "Medium Diversity",
                                                   "High Diversity"),
                               pct_pres = c(0.9, NA, NA),
                               label = c(paste0("Kruskal-Wallis, p = ", scales::scientific(ba_genes_stats$p)),
                                         NA,
                                         NA)) %>% 
  mutate(diversity_group = factor(diversity_group, levels = c("Low Diversity",
                                                   "Medium Diversity",
                                                   "High Diversity")))

gg_ba_genes_box <-
ba_genes %>%
  filter(grepl(patientID, pattern = "^lt")) %>% 
  mutate(gene = factor(gene, levels = c("BaiA", "BaiA1", "BaiA2",
                                        "BaiB", "BaiCD", "BaiE",
                                        "BaiF", "BaiG", "BaiH", 
                                        "BaiI", "BSH",
                                        "3aHSDH", "3bHSDH",
                                        "7aHSDH", "7bHSDH",
                                        "12abHSDH",
                                        "5AR", "5BR"))) %>% 
  filter(sampleID %in% metaphlan_df2$sampleID) %>% 
  left_join(metaphlan_df_sumry %>% select(sampleID, Shannon, diversity_group)) %>% 
  mutate(pres_abs = ifelse(tpm > 0, 1, 0)) %>% 
  group_by(sampleID, Shannon, diversity_group) %>% 
  summarise(tot_pres_abs = sum(pres_abs)) %>% 
  mutate(pct_pres = tot_pres_abs / length(unique(ba_genes$gene))) %>% 
  ggplot(aes(x = diversity_group, y = pct_pres, fill = diversity_group)) +
  geom_violin(trim = TRUE, alpha = 0.85) +
  geom_boxplot(alpha = 0.5, fill = "white", width = 0.4) +
  geom_jitter(color = "black", fill = "white", alpha = 0.5, shape = 21, size = 1.5, position = position_jitter(width = 0.15, seed = 123)) +
  geom_text(data = genes_stats_anno, label = genes_stats_anno$label, nudge_x = -0.35) +
  theme_bw() +
  theme(
        panel.grid = eb(),
        plot.background = eb(),
        panel.border = er(colour = "black",
                          fill = NA,
                          linewidth = 0.5), 
        axis.title = et(color = "black", size = 14),
        axis.text.y = et(color = "black", size = 12),
        axis.text.x = eb(),
        axis.ticks.x = eb(),
        strip.text.x = eb(),
        strip.text.y = et(angle = 0, size = 14, hjust = 0),
        strip.background = eb(),
        axis.line = el(color = "black"),
        legend.position = "right",
  ) +
  facet_grid(. ~diversity_group, scales = "free", space = "free")+
  scale_fill_manual(values = diversity_group_colors) +
  ylab("Genes Present (%) \n") +
  xlab("") +
  guides(fill = guide_legend(title = "Diversity Group"),
         color = guide_legend(title = "Diversity Group")) +
  scale_x_discrete(expand = expansion(mult = c(2, 2))) +
  scale_y_continuous(breaks = seq(0, 1, 0.25),
                     expand = expansion(mult = c(0.01, 0.03)),
                     labels = c("0%", "25%", "50%", "75%", "100%")) 

# pdf(file = "./Results/Figure_5.pdf", height = 14, width = 20, onefile = FALSE)
# gg.stack(gg_metaphlan_family,
#          # gg_metaphlan,
#          gg_ba_genes_heatmap,
#          gg_ba_genes_bars,
#          gg_ba_genes_box,
#          heights = c(0.2, 0.5, 0.2, 0.2),
#          gap = 2)
# dev.off()

pdf(file = "./Results/Figure_4.pdf", height = 10, width = 16, onefile = FALSE)
(
    (gg_metaphlan_pathos + theme(plot.margin = margin(t = 0, r = 5, b = -5, l = 5, "pt"))) /
    patchwork:: plot_spacer() /
    (gg_ba_genes_heatmap + theme(plot.margin = margin(t = 0, r = 5, b = -5, l = 5, "pt"))) /
    patchwork::plot_spacer() / 
    (gg_ba_genes_bars + theme(plot.margin = margin(t = 0, r = 5, b = -5, l = 5, "pt"))) / 
    patchwork::plot_spacer() / 
    (gg_ba_genes_box + theme(plot.margin = margin(t = 0, r = 5, b = -5, l = 5, "pt")))
  ) + 
  patchwork::plot_layout(guides = 'collect', 
                         heights = c(0.15, 
                                     -0.03, # Spacer
                                     0.6,
                                     -0.03, # Spacer
                                     0.3, 
                                     -0.03, # Spacer
                                     0.3))
invisible(dev.off())

{
    ((gg_metaphlan_pathos + theme(plot.margin = margin(t = 0, r = 5, b = -5, l = 5, "pt"))) /
    patchwork:: plot_spacer() /
    (gg_ba_genes_heatmap + theme(plot.margin = margin(t = 0, r = 5, b = -5, l = 5, "pt"))) /
    patchwork::plot_spacer() / 
    (gg_ba_genes_bars + theme(plot.margin = margin(t = 0, r = 5, b = -5, l = 5, "pt"))) / 
    patchwork::plot_spacer() / 
    (gg_ba_genes_box + theme(plot.margin = margin(t = 0, r = 5, b = -5, l = 5, "pt")))
  ) + 
  patchwork::plot_layout(guides = 'collect', 
                         heights = c(0.15, 
                                     -0.03, # Spacer
                                     0.6,
                                     -0.03, # Spacer
                                     0.3, 
                                     -0.03, # Spacer
                                     0.3))
}

Figure 2: Qualitative Metabolomics Heatmap

heatmap_data <- 
  t_metaphlan %>% 
  drop_na(taxid) %>% 
  select(sampleID) %>% 
  group_by(sampleID) %>% 
  dplyr::slice(1) %>% 
  mutate(db = ifelse(grepl(sampleID, pattern = "lt"), "Liver Transplant", "Healthy Donor")) %>%
  left_join(metab_qual_anon) %>% 
  mutate(compound = ifelse(compound == "isovaleric-acid", "isovalerate", compound),
         compound = str_to_title(compound),
         compound = recode(compound,
                          Preq1 = "PreQ1")) %>% 
  filter(compound %in% heatmap_cmpds$compound|is.na(compound)) %>% 
  group_by(compound) %>% 
  mutate(median_val = ifelse(median(mvalue, na.rm = TRUE) == 0, min(mvalue[mvalue > 0], na.rm = TRUE)/10, median(mvalue, na.rm = TRUE)),
         heatmap_val = ifelse(log(mvalue/ median_val, base = 2) == -Inf, 0, log(mvalue/ median_val, base = 2))) %>% 
  ungroup() %>% 
  select(-c(mvalue, median_val)) %>% 
  group_by(sampleID, compound) %>% 
  slice_max(heatmap_val, with_ties = F, n = 1) %>% 
  ungroup() %>%
  select(-db) %>% 
  left_join(metaphlan_df2 %>%
               left_join(metaphlan_df_sumry %>% select(sampleID, db, diversity_group_abv)) %>%
               distinct(sampleID, db, diversity_group_abv), by = "sampleID") %>% 
  group_by(sampleID, compound) %>% 
  slice_max(heatmap_val, with_ties = F, n = 1) %>% 
  left_join(alpha_shannon) %>% 
  group_by(db) %>% 
  arrange(db, Shannon) %>% 
  ungroup()

# Stats
heatmap_pvals <-
  heatmap_data %>%
  filter(diversity_group_abv != "Healthy Donor") %>% 
  drop_na(compound) %>%
  group_by(compound) %>%
  rstatix::kruskal_test(heatmap_val~diversity_group_abv) %>%
  rstatix::adjust_pvalue(method = "BH") %>%
  select(compound, statistic, p, p.adj) 

heatmap_labels <- 
  heatmap_data %>% 
  mutate(compound = str_to_title(compound)) %>% 
  left_join(heatmap_lookup) %>%
  left_join(heatmap_pvals %>% group_by(compound, p.adj, p) %>% slice(1)) %>%
  arrange(class, subclass, p) %>%
  arrange(class, subclass) %>% 
  pivot_wider(c(diversity_group_abv, sampleID, db, Shannon), names_from = "compound", values_from = "heatmap_val", values_fn = mean) %>% 
  group_by(db) %>% 
  arrange(db, Shannon) %>% 
  ungroup() %>% 
  distinct(sampleID, .keep_all = TRUE) %>% 
  select(diversity_group_abv) %>% 
  mutate(
    diversity_group_abv = as.character(diversity_group_abv),
    diversity_group_abv = ifelse(
      grepl(pattern = "Healthy", as.character(diversity_group_abv)),
      diversity_group_abv,
      paste(diversity_group_abv, "Diversity")
    ),
    diversity_group_abv = factor(diversity_group_abv, levels = c("Low Diversity", "Medium Diversity",
                                                                 "High Diversity", "Healthy Donor"))
  )
  
# Build heatmap compound order (row order) and compound class (row slice)
heatmap_order <- 
  heatmap_data %>% 
  ungroup() %>% 
  filter(compound %in% heatmap_cmpds$compound) %>% 
  distinct(compound) %>% 
  drop_na() %>% 
  left_join(heatmap_lookup) %>% 
  mutate(class = case_when(class %in% c("Dicarboxylic Acid") ~ "Fatty Acid",
                           class %in% c("Phenolic Aromatic", "Kynurine Pathway") ~ "Add. Cmpds.",
                           TRUE ~ class),
         subclass = case_when(subclass %in% c("Branched-Chain Fatty Acid",
                                              "Aminated Fatty Acid",
                                              "Long-Chain Fatty Acid",
                                              "Dicarboxylic Acid") ~ "Other Fatty Acid",
                              subclass == "Indole" ~ "Tryptophan Metabs.",
                              subclass %in% c("Phenolic Aromatic", "Kynurine Pathway") ~ "Add. Cmpds.",
                              TRUE ~ subclass
                              )) %>% 
  left_join(heatmap_pvals %>% group_by(compound, p.adj, p) %>% slice(1)) %>%
  ungroup() %>% 
  mutate(class = factor(
    class,
    levels = c(
      "Fatty Acid",                   # 1
      # "Dicarboxylic Acid",          
      "Amino Acid",                   # 2
      "Bile Acid",                    # 3
      "Indole",                       # 4
      "Add. Cmpds.",                  # 5
      # "Phenolic Aromatic",          
      # "Kynurine Pathway",             
      "Vitamin"                       # 6
    )
  ),
  subclass = factor(
    subclass,
    levels = c(
      "Short-Chain Fatty Acid",       # 1
      "Other Fatty Acid",             # 2
      "Amino Acid",                   # 3
      # "Aminated Fatty Acid",        
      # "Long-Chain Fatty Acid",      
      # "Dicarboxylic Acid",          
      "Primary Bile Acid",            # 4
      "Secondary Bile Acid",          # 5
      "Conjugated Bile Acid",         # 6
      "Tryptophan Metabs.",           # 7
      # "Phenolic Aromatic",            # 
      # "Kynurine Pathway",             # 
      "Vitamin",                      # 8
      "Add. Cmpds."                   # 9
    )
  )) %>% 
  arrange(class, subclass, p, compound) %>%
  select(class, subclass, p, compound)
  
# Build heatmap patient order following same order as gg_metaphlan plot
heatmap_column_order <- heatmap_data %>% 
  group_by(patientID) %>% 
  dplyr::slice(1) %>% 
  left_join(alpha_shannon) %>% 
  group_by(db) %>% 
  arrange(db, Shannon) %>%  
  select(patientID) %>% 
  distinct(patientID) %>% 
  pull(patientID)

# P-value legend color
pvalue_col_fun = colorRamp2(c(0, 0.045), c("#75C236", "#E3E4E6"))

# Create heatmap for adjusted p-values
pvalue_adj <-
heatmap_pvals %>% 
  group_by(compound, p.adj, p) %>% 
  dplyr::slice(1) %>% 
  ungroup() %>% 
  left_join(heatmap_lookup) %>%
  mutate(class = case_when(class %in% c("Dicarboxylic Acid") ~ "Fatty Acid",
                           class %in% c("Phenolic Aromatic", "Kynurine Pathway") ~ "Add. Cmpds.",
                           TRUE ~ class),
         subclass = case_when(subclass %in% c("Branched-Chain Fatty Acid",
                                              "Aminated Fatty Acid",
                                              "Long-Chain Fatty Acid",
                                              "Dicarboxylic Acid") ~ "Other Fatty Acid",
                              subclass == "Indole" ~ "Tryptophan Metabs.",
                              subclass %in% c("Phenolic Aromatic", "Kynurine Pathway") ~ "Add. Cmpds.",
                              TRUE ~ subclass
                              )) %>% 
  left_join(heatmap_pvals %>% group_by(compound, p.adj, p) %>% slice(1)) %>%
  ungroup() %>% 
  mutate(class = factor(
    class,
    levels = c(
      "Fatty Acid",                   # 1
      # "Dicarboxylic Acid",          
      "Amino Acid",                   # 2
      "Bile Acid",                    # 3
      "Indole",                       # 4
      "Add. Cmpds.",                  # 5
      # "Phenolic Aromatic",          
      # "Kynurine Pathway",             
      "Vitamin"                       # 6
    )
  ),
  subclass = factor(
    subclass,
    levels = c(
      "Short-Chain Fatty Acid",       # 1
      "Other Fatty Acid",             # 2
      "Amino Acid",                   # 3
      # "Aminated Fatty Acid",        
      # "Long-Chain Fatty Acid",      
      # "Dicarboxylic Acid",          
      "Primary Bile Acid",            # 4
      "Secondary Bile Acid",          # 5
      "Conjugated Bile Acid",         # 6
      "Tryptophan Metabs.",           # 7
      # "Phenolic Aromatic",            # 
      # "Kynurine Pathway",             # 
      "Vitamin",                      # 8
      "Add. Cmpds."                   # 9
    )
  )) %>% 
  arrange(class, subclass, p.adj, compound) %>%
  select(class, subclass, p.adj, compound) %>% 
  column_to_rownames(var = "compound") %>% 
  select(`Adjusted p-value` = p.adj) %>% 
  as.matrix() %>% 
Heatmap(name = "Significance (adjusted p-value)",
        cluster_rows = FALSE,
        cluster_columns = FALSE,
        col = pvalue_col_fun,        
        column_title = "KW Test",
        column_title_side = "bottom",
        column_title_rot = 90,
        show_column_names = FALSE,
        column_names_side = "top",
        rect_gp = gpar(col = "black", lwd = 0.2),
        row_gap = unit(3.5, "mm"),
        row_names_side = "left",
        row_order = heatmap_order$compound,
        row_split = heatmap_order$subclass,
        row_names_gp = gpar(fontsize = 19), # Compounds on y-axis
        show_row_names = TRUE,
        row_title_rot = 0,
        row_title_gp = gpar(fontsize = 0),
        heatmap_legend_param = list(at = seq(0.05, 0, -0.01)),
        width = unit(0.1, "in")
)

# Heatmap legend color
col_fun <- colorRamp2(breaks = c(-5, 0, 5), colors = c("#00aaad", "white", "#ad003a"))

# Global parameter for annotation 
# ht_opt$COLUMN_ANNO_PADDING <- unit(2.5, "mm")

gg_metab_heatmap <-
  heatmap_data %>% 
  mutate(compound = str_to_title(compound),
         compound = recode(compound,
                          Preq1 = "PreQ1")) %>%
  left_join(heatmap_lookup) %>% 
  left_join(heatmap_pvals %>% group_by(compound, p.adj, p) %>% slice(1) %>% select(compound, p.adj, p), by = "compound") %>%
  mutate(class = case_when(class %in% c("Dicarboxylic Acid") ~ "Fatty Acid",
                           class %in% c("Phenolic Aromatic", "Kynurine Pathway") ~ "Add. Cmpds.",
                           TRUE ~ class),
         subclass = case_when(subclass %in% c("Branched-Chain Fatty Acid",
                                              "Aminated Fatty Acid",
                                              "Long-Chain Fatty Acid",
                                              "Dicarboxylic Acid") ~ "Other Fatty Acid",
                              subclass == "Indole" ~ "Tryptophan Metabs.",
                              subclass %in% c("Phenolic Aromatic", "Kynurine Pathway") ~ "Add. Cmpds.",
                              TRUE ~ subclass
                              )) %>% 
  left_join(heatmap_pvals %>% group_by(compound, p.adj, p) %>% slice(1)) %>%
  ungroup() %>% 
  mutate(class = factor(
    class,
    levels = c(
      "Fatty Acid",                   # 1
      # "Dicarboxylic Acid",          
      "Amino Acid",                   # 2
      "Bile Acid",                    # 3
      "Indole",                       # 4
      "Add. Cmpds.",                  # 5
      # "Phenolic Aromatic",          
      # "Kynurine Pathway",             
      "Vitamin"                       # 6
    )
  ),
  subclass = factor(
    subclass,
    levels = c(
      "Short-Chain Fatty Acid",       # 1
      "Other Fatty Acid",             # 2
      "Amino Acid",                   # 3
      # "Aminated Fatty Acid",        
      # "Long-Chain Fatty Acid",      
      # "Dicarboxylic Acid",          
      "Primary Bile Acid",            # 4
      "Secondary Bile Acid",          # 5
      "Conjugated Bile Acid",         # 6
      "Tryptophan Metabs.",           # 7
      # "Phenolic Aromatic",            # 
      # "Kynurine Pathway",             # 
      "Vitamin",                      # 8
      "Add. Cmpds."                   # 9
    )
  )) %>%
  mutate(
    diversity_group_abv = as.character(diversity_group_abv),
    diversity_group_abv = ifelse(
      grepl(pattern = "Healthy", as.character(diversity_group_abv)),
      diversity_group_abv,
      paste(diversity_group_abv, "Diversity")
    ),
    diversity_group_abv = factor(diversity_group_abv, levels = c("Low Diversity", "Medium Diversity",
                                                                 "High Diversity", "Healthy Donor"))
  ) %>% 
  arrange(class, subclass, p.adj, compound) %>%
  pivot_wider(c(diversity_group_abv, db, patientID, Shannon), names_from = compound, values_from = heatmap_val) %>%
  group_by(patientID) %>% 
  arrange(patientID) %>% 
  ungroup() %>% 
  select(-`NA`) %>%
  distinct(patientID, .keep_all = TRUE) %>%
  group_by(diversity_group_abv) %>% 
  arrange(db, Shannon) %>% 
  ungroup() %>% 
  select(-Shannon, -db, -diversity_group_abv) %>% 
  column_to_rownames("patientID") %>%
  as.matrix() %>% 
  t() %>%
  Heatmap(
    name = "Fold Change (log2)",
    col = col_fun,
    na_col = "grey83",
    rect_gp = gpar(col = "grey40", lwd = 1.5),
    # column_names_gp = grid::gpar(fontsize = 16),
    column_gap = unit(2.5, "mm"),
    column_split = heatmap_labels,
    column_order = heatmap_column_order,
    column_title_gp = gpar(fontsize = 20), # Diversity group labels on top of plot
    column_title_rot = 0,
    cluster_columns = FALSE,
    show_column_names = FALSE,
    show_column_dend = FALSE,
    row_names_gp = gpar(fontsize = 16), # Compounds on y-axis
    row_title_gp = gpar(fontsize = 18), # Compound classes on y-axis
    row_title_side = "right",  # Place the compound classes on right side of y-axis 
    row_gap = unit(3.5, "mm"),
    row_names_side = c("left"),
    row_order = heatmap_order$compound,
    row_split = heatmap_order$subclass,
    show_row_names = TRUE,
    row_title_rot = 0,
    cluster_rows = FALSE,
    show_row_dend = FALSE,
    heatmap_height = unit(24, "in"),
    heatmap_width =  unit(16.5, "in")
  )

gg_metab_heatmap_tot <- pvalue_adj + gg_metab_heatmap
# gg_metab_heatmap_tot


pdf(file = "./Results/Figure_2.pdf", height = 24, width = 24, onefile = F)
gg_metab_heatmap_tot
invisible(dev.off())

Supplemental Figure 2: Quant Metabolomics-Healthy Donor vs LT Patients

# Conversions of compounds
metab_conversions <- data.frame(
  compound = c(
    "Taurocholic Acid",
    "Glycocholic Acid",
    "Cholic Acid",
    "3-Oxolithocholic Acid",
    "Alloisolithocholic Acid",
    "Deoxycholic Acid",
    "Isodeoxycholic Acid",
    "Lithocholic Acid",
    "Kynurenic Acid",
    "Kynurenine",
    "Anthranilic Acid",
    "Desaminotyrosine",
    "Niacin",
    "Tyrosine",
    "Tryptamine",
    "Tryptophan",
    "Phenylalanine",
    "Acetate",
    "Butyrate",
    "Propionate",
    "Succinate"
  ),
  molar_mass__gmol = c(
    515.7,
    465.6,
    408.6,
    374.6,
    376.6,
    392.6,
    392.6,
    376.6,
    189.17,
    208.21,
    137.14,
    166.17,
    123.11,
    181.19,
    160.22,
    204.22,
    165.19,
    59.04,
    87.1,
    73.07,
    116.07
  )
)

metab_quant_converted <- metab_quant_anon %>%
  mutate(
    compound = ifelse(compound == "isovaleric-acid", "isovalerate", compound),
    compound = str_to_title(compound)
  ) %>%
  mutate(units = case_when(compound %in% c(
    "Taurocholic Acid",
    "Glycocholic Acid",
    "Cholic Acid",
    "3-Oxolithocholic Acid",
    "Alloisolithocholic Acid",
    "Deoxycholic Acid",
    "Isodeoxycholic Acid",
    "Lithocholic Acid"
  ) ~ "ugmL",
  compound %in% c("Acetate", "Butyrate", "Succinate", "Propionate") ~ "mM",
  TRUE ~ "uM")) %>% 
  right_join(metab_conversions, by = "compound") %>% # right_join to only keep compounds we're interested in
  mutate(mvalue__mM = case_when(units == "ugmL" ~ (mvalue*               #ugmL
                                                  ((1000/                #1000 ml per L
                                                    1000000)/            #1000000 ug per gram
                                                    molar_mass__gmol)*   #gram per mol (molar mass)
                                                    1000                 #1000 mM per 1M                                            
                                                   ),
                                units == "uM" ~ mvalue/                  #uM
                                                1000,                    #1000uM per 1M
                                TRUE ~ mvalue #units are already in mM
                                )
         ) %>% 
  select(sampleID, compound, mvalue__mM)
  

metab_boxplot <- 
  metaphlan_df2 %>% 
  left_join(metaphlan_df_sumry) %>% 
  drop_na(taxid) %>% 
  select(sampleID, diversity_group_abv, db) %>% 
  group_by(sampleID) %>% 
  dplyr::slice(1) %>% 
  left_join(metab_quant_converted) %>% 
  mutate(compound = ifelse(compound == "isovaleric-acid", "isovalerate", compound),
         compound = str_to_title(compound)) %>% 
  ungroup() %>%
  mutate(class = case_when(compound %in% c("Taurocholic Acid", "Glycocholic Acid") ~ "Conjugated Primary Bile Acid",
                           compound %in% c("Cholic Acid") ~ "Primary Bile Acid",
                           compound %in% c("3-Oxolithocholic Acid", "Alloisolithocholic Acid", "Deoxycholic Acid", "Isodeoxycholic Acid",
                                                "Lithocholic Acid") ~ "Secondary Bile Acid",
                           compound %in% c("Threonine", "Glycine", "Tyrosine", "Tyramine", "Serine", "Leucine", "Isoleucine",
                                               "Valine", "Phenylalanine", "Alanine", "Proline", "Aspartate", 
                                               "Methionine", "Glutamate", "Lysine", "Cysteine", "Tryptophan") ~ "Amino Acid",
                           compound %in% c("Acetate", "Butyrate", "Succinate", "Propionate") ~ "Short-Chain Fatty Acid",
                           compound %in% c("Kynurenic Acid", "Anthranilic Acid", "Kynurenine", "Tryptamine") ~ "Kynurenine Metabolite",
                           compound == "Desaminotyrosine" ~ "Phenolic Aromatic",
                           compound == "Niacin" ~ "B-Vitamin",
                              TRUE ~ "Indole"),
         compound = case_when(class == "Conjugated Primary Bile Acid" ~ paste(compound, "(1ËšConj. BA)"),
                              class == "Primary Bile Acid" ~ paste(compound, "(1Ëš BA)"),
                              class == "Secondary Bile Acid" ~ paste(compound, "(2Ëš BA)"),
                              class == "Short-Chain Fatty Acid" ~ paste(compound, "(SCFA)"),
                              class == "Amino Acid" ~ paste(compound, "(AA)"),
                              class == "Phenolic Aromatic" ~ paste(compound, "(Phen. Arom.)"),
                              class == "Indole" ~ paste0(compound, "(Indole)"),
                              class == "Kynurenine Metabolite" ~ paste(compound, "(Kyn. Metab.)"),
                              class == "B-Vitamin" ~ paste(compound, "(B-Vitamin)")
                              )) %>% 
  drop_na() %>% 
  group_by(compound) %>% 
  mutate(count = length(unique(diversity_group_abv))) %>% 
  filter(count == 4) %>% 
  select(-count) %>%
  mutate(compound = factor(
    compound,
    levels = c(
      "Acetate (SCFA)",                  # 1
      "Propionate (SCFA)",               # 2
      "Butyrate (SCFA)",                 # 3
      "Succinate (SCFA)",                # 4
      "Tyrosine (AA)",                   # 5
      "Tryptophan (AA)",                 # 6
      "Phenylalanine (AA)",              # 7
      "Cholic Acid (1Ëš BA)",             # 8
      "Glycocholic Acid (1ËšConj. BA)",   # 9
      "Taurocholic Acid (1ËšConj. BA)",   # 10
      "Deoxycholic Acid (2Ëš BA)",        # 11
      "Lithocholic Acid (2Ëš BA)",        # 12
      "Isodeoxycholic Acid (2Ëš BA)",     # 13
      "3-Oxolithocholic Acid (2Ëš BA)",   # 14
      "Alloisolithocholic Acid (2Ëš BA)", # 15
      "Desaminotyrosine (Phen. Arom.)",  # 16
      "Kynurenine (Kyn. Metab.)",        # 17
      "Anthranilic Acid (Kyn. Metab.)",  # 18
      "Tryptamine (Kyn. Metab.)",        # 19
      "Niacin (B-Vitamin)"               # 20
    )
  ),
  class = factor(
    class,
    levels = c(
      "Short-Chain Fatty Acid",          # 1
      "Amino Acid",                      # 2
      "Primary Bile Acid",               # 3
      "Conjugated Primary Bile Acid",    # 4
      "Secondary Bile Acid",             # 5
      "Indole",                          # 6
      "Phenolic Aromatic",               # 7
      "Kynurenine Metabolite",           # 8
      "B-Vitamin"                        # 9
    )
  )) %>% 
  filter(compound != "Kynurenic Acid") %>% 
  mutate(db = factor(db, levels = c("Liver Transplant", "Healthy Donor")),
         diversity_group_abv = factor(diversity_group_abv, levels = c("Low", "Medium", "High", "Healthy Donor"))) %>% 
  filter(compound %in% c(
    "Acetate (SCFA)",
    "Butyrate (SCFA)",
    "Propionate (SCFA)",
    "Glycocholic Acid (1ËšConj. BA)",
    "Taurocholic Acid (1ËšConj. BA)",
    "Cholic Acid (1Ëš BA)",
    "Deoxycholic Acid (2Ëš BA)",
    "Lithocholic Acid (2Ëš BA)",
    "Alloisolithocholic Acid (2Ëš BA)"
    ))


metab_boxplot_stats <-
  metab_boxplot %>% 
  group_by(class, compound) %>% 
  rstatix::wilcox_test(mvalue__mM~diversity_group_abv,
                       comparisons = diversity_comps,
                       p.adjust.method = "none",
                       alternative= "two.sided") %>% 
  rstatix::adjust_pvalue(method = "BH") %>%
  rstatix::add_significance("p.adj") %>% 
  mutate(p.adj = ifelse(p.adj < 0.001, "p.adj < 0.001", paste("p.adj = ", round(p.adj, 2)))) %>% 
  mutate(y.position = case_when(group1 == "High" & group2 == "Healthy Donor" ~ 2.30,
                                group1 == "Medium" & group2 == "High" ~ 2.7,
                                group1 == "Low" & group2 == "High" ~ 3.1,
                                group1 == "Low" & group2 == "Medium" ~ 3.5)) # Set stats brackets to fixed points since the y-scale will be log10 transformed

# Summary statistics
metab_boxplot_summay_stats <-
tbl_summary(
  metab_boxplot %>%
    ungroup() %>%
    select(sampleID, diversity_group_abv,  compound, mvalue__mM) %>%
    pivot_wider(
      id_cols = c(sampleID, diversity_group_abv),
      names_from = "compound",
      values_from = "mvalue__mM"
    ) %>% column_to_rownames(var = "sampleID"),
    by = diversity_group_abv,
    type = all_continuous() ~ "continuous2",
    statistic = all_continuous() ~ c("{mean} ({sd})", "{median} ({p25}, {p75})", "{min}-{max}"),
    digits = all_continuous() ~ function(x) format(x, digits = 3, scientific = TRUE)
  ) %>% 
  bold_labels() %>% 
  italicize_levels() 

metab_boxplot_summay_stats %>%
    gtsummary::modify_caption("**Diversity Group Summary Statistics**")
Diversity Group Summary Statistics
Characteristic Low, N = 36 Medium, N = 39 High, N = 26 Healthy Donor, N = 21
Acetate (SCFA)
    Mean (SD) 6.54e+00 (9.29e+00) 2.12e+01 (2.46e+01) 3.33e+01 (2.40e+01) 4.82e+01 (1.85e+01)
    Median (IQR) 1.63e+00 (3.38e-01, 1.16e+01) 1.29e+01 (2.02e+00, 4.14e+01) 3.13e+01 (1.51e+01, 4.62e+01) 4.93e+01 (2.95e+01, 6.31e+01)
    Minimum-Maximum 0.0e+00-4.81e+01 0.0e+00-1.14e+02 6.5e-01-8.62e+01 2.0e+01-7.61e+01
Alloisolithocholic Acid (2Ëš BA)
    Mean (SD) 5.90e-04 (7.63e-04) 6.70e-04 (1.04e-03) 6.51e-03 (1.08e-02) 3.00e-02 (3.40e-02)
    Median (IQR) 2.79e-04 (0.00e+00, 7.97e-04) 1.59e-04 (0.00e+00, 1.02e-03) 1.18e-03 (0.00e+00, 6.52e-03) 1.23e-02 (7.41e-03, 3.20e-02)
    Minimum-Maximum 0e+00-2.55e-03 0e+00-5.12e-03 0e+00-4.28e-02 0e+00-1.07e-01
Butyrate (SCFA)
    Mean (SD) 4.19e-01 (3.80e-01) 2.36e+00 (3.22e+00) 5.88e+00 (6.08e+00) 1.53e+01 (1.01e+01)
    Median (IQR) 4.60e-01 (6.00e-02, 5.80e-01) 8.30e-01 (3.05e-01, 3.25e+00) 3.89e+00 (1.99e+00, 6.86e+00) 1.24e+01 (8.87e+00, 2.34e+01)
    Minimum-Maximum 0.0e+00-1.28e+00 0.0e+00-1.22e+01 4.0e-02-2.66e+01 1.7e+00-3.85e+01
Cholic Acid (1Ëš BA)
    Mean (SD) 4.81e-01 (1.33e+00) 8.09e-01 (1.80e+00) 6.09e-01 (2.50e+00) 2.08e-02 (4.12e-02)
    Median (IQR) 2.60e-02 (2.43e-03, 1.83e-01) 1.23e-01 (2.13e-02, 7.06e-01) 5.09e-03 (1.68e-03, 6.89e-02) 5.41e-03 (2.14e-03, 1.21e-02)
    Minimum-Maximum 0.00e+00-7.49e+00 1.10e-03-9.18e+00 0.00e+00-1.27e+01 4.85e-04-1.67e-01
Deoxycholic Acid (2Ëš BA)
    Mean (SD) 6.61e-04 (7.86e-04) 2.61e-02 (1.28e-01) 2.52e-01 (3.31e-01) 1.56e+00 (2.01e+00)
    Median (IQR) 3.53e-04 (3.82e-05, 1.13e-03) 6.62e-04 (5.09e-05, 3.40e-03) 1.01e-01 (4.12e-02, 2.63e-01) 9.39e-01 (4.81e-01, 1.79e+00)
    Minimum-Maximum 0.00e+00-2.75e-03 0.00e+00-8.02e-01 3.33e-03-1.11e+00 1.01e-01-8.87e+00
Glycocholic Acid (1ËšConj. BA)
    Mean (SD) 1.39e-01 (5.16e-01) 1.76e-01 (6.96e-01) 1.42e-03 (3.08e-03) 4.67e-03 (9.85e-03)
    Median (IQR) 4.51e-03 (1.66e-04, 2.43e-02) 2.13e-03 (8.40e-05, 5.12e-02) 2.04e-04 (0.00e+00, 1.50e-03) 5.80e-04 (8.38e-05, 2.90e-03)
    Minimum-Maximum 0e+00-2.93e+00 0e+00-4.31e+00 0e+00-1.45e-02 0e+00-4.17e-02
Lithocholic Acid (2Ëš BA)
    Mean (SD) 3.73e-04 (6.01e-04) 1.96e-02 (7.89e-02) 4.21e-01 (5.52e-01) 9.03e-01 (7.03e-01)
    Median (IQR) 0.00e+00 (0.00e+00, 5.58e-04) 6.37e-04 (1.99e-04, 1.94e-03) 1.46e-01 (4.75e-02, 6.92e-01) 6.88e-01 (4.32e-01, 1.14e+00)
    Minimum-Maximum 0.00e+00-1.75e-03 0.00e+00-4.41e-01 2.37e-03-2.24e+00 1.72e-01-2.86e+00
Propionate (SCFA)
    Mean (SD) 6.58e-01 (1.02e+00) 5.13e+00 (6.45e+00) 1.07e+01 (8.24e+00) 1.63e+01 (7.21e+00)
    Median (IQR) 4.35e-01 (1.38e-01, 6.20e-01) 1.87e+00 (5.15e-01, 7.83e+00) 9.50e+00 (3.95e+00, 1.70e+01) 1.76e+01 (1.04e+01, 2.06e+01)
    Minimum-Maximum 0.00e+00-5.81e+00 0.00e+00-2.39e+01 8.00e-02-2.80e+01 2.96e+00-2.89e+01
Taurocholic Acid (1ËšConj. BA)
    Mean (SD) 5.62e-02 (9.73e-02) 1.10e-01 (2.91e-01) 1.05e-03 (2.16e-03) 9.58e-04 (1.60e-03)
    Median (IQR) 9.42e-03 (2.48e-04, 6.03e-02) 1.32e-03 (1.45e-04, 2.19e-02) 2.91e-05 (0.00e+00, 5.43e-04) 2.33e-04 (0.00e+00, 1.24e-03)
    Minimum-Maximum 0e+00-4.01e-01 0e+00-1.60e+00 0e+00-9.71e-03 0e+00-6.73e-03
gt::gtsave(gtsummary::as_gt(metab_boxplot_summay_stats), file = "./Results/Metab_Quant_Summary_Stats.png",
           vwidth = 1500, vheight = 1000)

set.seed(123) # for consistent jittering of points
gg_metab_boxplot <-
ggboxplot(metab_boxplot,
          x = "diversity_group_abv",
          y = "mvalue__mM",
          fill = "db",
          color = "diversity_group_abv",
          alpha = 0.65,
          outlier.shape = NA,
          facet.by = c("class", "compound")) +
    theme(legend.text = et(size = 12, color = "black"),
          legend.title = et(size = 14, color = "black"),
          axis.title.x = eb(),
          axis.title.y = et(size = 12, color = "black"),
          panel.border = eb(),
          strip.background = er(colour="white", fill="white"),
          ) +
    geom_hline(yintercept = 0) + 
    geom_segment(aes(x = 0.35, y = 0, xend = 0.35, yend = Inf)) +
    facet_wrap(~compound, scales = "fixed") +
    stat_pvalue_manual(metab_boxplot_stats, 
                       tip.length = 0.015) +
  geom_point(
    data = metab_boxplot,
    aes(x = diversity_group_abv, y = mvalue__mM, color = diversity_group_abv),
    position = position_jitter(width = 0.2),
    size = 2,
    alpha = 0.65) +
    scale_fill_manual("Cohort", values = rev(pirate_colors)) +
    scale_color_manual("Diversity Group", values = diversity_group_colors) +
    scale_y_log10(limits = c(0.01, 5000),
                  labels = c(0.01, 0.1, 1, 10, 100, 1000),
                  breaks = c(0.01, 0.1, 1, 10, 100, 1000),
                  expand = expansion(mult = c(0.1, 0.2))) +
    ylab("Concentration (mM)\n")

gg_metab_boxplot

cairo_pdf(filename = "./Results/Supplemental_Figure_2.pdf", width = 14, height = 10, onefile = FALSE)

gg_metab_boxplot

invisible(dev.off())

Figure 5A: Enterococcus Expansion and Infection

gg_ecoc_all_patients <- peri_matrix_all %>%
    distinct(sampleID, .keep_all = T) %>%
    select(sampleID, patientID, bact_infection_present) %>%
    mutate(bact_infection_present = ifelse(bact_infection_present ==
        "No", "No Infection", "Infection"), bact_infection_present = factor(bact_infection_present,
        levels = c("Infection", "No Infection"))) %>%
    left_join(metaphlan_peri_anon %>%
        select(-bact_infection_present) %>%
        group_by(patientID, sampleID) %>%
        filter(grepl(x = Species, pattern = "Enterococcus", ignore.case = T)) %>%
        count(sampleID, wt = pctseqs, name = "enterococcus_rel_abundance")) %>%
    left_join(metaphlan_peri_anon %>%
        select(patientID, sampleID, Kingdom:pctseqs)) %>%
    ungroup() %>%
    arrange(Kingdom, Phylum, Class, Order, Family, Genus) %>%
    mutate(Genus = paste0(Phylum, "-", Order, "-", Family, "-",
        Genus)) %>%
    group_by(sampleID) %>%
    arrange(Genus) %>%
    mutate(cum.pct = cumsum(pctseqs), y.text = (cum.pct + c(0,
        cum.pct[-length(cum.pct)]))/2) %>%
    ungroup() %>%
    dplyr::select(-cum.pct) %>%
    mutate(sampleID = ifelse(is.na(sampleID), patientID, sampleID),
        Genus = factor(Genus, levels = unique(Genus)), Genus = fct_relevel(Genus,
            "Firmicutes-Lactobacillales-Enterococcaceae-Enterococcus",
            after = Inf)) %>%
    arrange(-enterococcus_rel_abundance) %>%
    filter(bact_infection_present != "No Infection") %>%
    ggplot(aes(x = reorder(sampleID, -enterococcus_rel_abundance),
        y = pctseqs)) + geom_bar(aes(fill = Genus), stat = "identity") +
    theme_bw() + theme(legend.position = "none", axis.text.x = et(angle = 90,
    hjust = 0.5), strip.text.x = eb(), strip.background = eb(),
    axis.title.y = et(color = "black", size = 12), axis.text.y = et(color = "black",
        size = 10)) + scale_fill_manual(values = metaphlan_pal2) +
    scale_y_continuous(expand = expansion(mult = c(0.005, 0.005))) +
    ylab("Shotgun Relative Abundance (%)\n") + xlab("") + facet_grid(~bact_infection_present,
    space = "free_x", scales = "free_x")
# gg_ecoc_all_patients


# Discrete heatmap of infections
ecoc_infx_orgs <- peri_criteria_all %>%
    filter(sampleID %in% peri_matrix_all$sampleID) %>%
    group_by(patientID, eday) %>%
    arrange(-infx_stool) %>%
    dplyr::slice(1) %>%
    ungroup() %>%
    select(patientID, sampleID, bact_infection_present, infx_stool,
        organism1, micro1.factor) %>%
    distinct() %>%
    mutate(organism1 = gsub(x = organism1, pattern = "\\s+",
        replacement = ""), organism1 = str_to_lower(string = organism1),
        organism1 = ifelse(grepl(x = organism1, pattern = "enterococcus|enterobacterales|klebsiella|escherichia|citrobacter|proteus|staphyl|clostrid|pseudo|steno|bacteroides|helico"),
            organism1, "Culture Negative")) %>%
    group_by(patientID, sampleID, infx_stool) %>%
    mutate(`Enterococcus faecium` = grepl(x = organism1, pattern = "enterococcusfaecium",
        ignore.case = T), `Enterococcus faecalis` = grepl(x = organism1,
        pattern = "enterococcusfaecalis", ignore.case = T), `Enterococcus avium` = grepl(x = organism1,
        pattern = "enterococcusavium", ignore.case = T), `Enterococcus gallinarum` = grepl(x = organism1,
        pattern = "enterococcusgallinarum", ignore.case = T),
        `Klebsiella pneumoniae` = grepl(x = organism1, pattern = "klebsiellapneumoniae",
            ignore.case = T), `Enterobacter cloaceae` = grepl(x = organism1,
            pattern = "enterobactercloaceae", ignore.case = T),
        `Escherichia coli` = grepl(x = organism1, pattern = "escherichiacoli",
            ignore.case = T), `Citrobacter freundii` = grepl(x = organism1,
            pattern = "citrobacterfreundii", ignore.case = T),
        `Proteus mirabilis` = grepl(x = organism1, pattern = "proteusmirabilis",
            ignore.case = T), `Staphylococcus aureus` = grepl(x = organism1,
            pattern = "staphylococcusaureus", ignore.case = T),
        `Staphylococcus epidermis` = grepl(x = organism1, pattern = "staphylococcusepidermidis",
            ignore.case = T), `Pseudomonas aeruginosa` = grepl(x = organism1,
            pattern = "pseudomonasaeruginosa", ignore.case = T),
        `Stenotrophmonas maltophilia` = grepl(x = organism1,
            pattern = "stenotrophmonasmaltophilia", ignore.case = T),
        `Helicobacter pylori` = grepl(x = organism1, pattern = "helicobacterpylori",
            ignore.case = T), `Clostridium difficile` = grepl(x = organism1,
            pattern = "clostridiumdifficile|clostridioidesdifficile",
            ignore.case = T), `Bacteroides sp.` = grepl(x = organism1,
            pattern = "bacteroides", ignore.case = T), `Culture Negative` = grepl(x = organism1,
            pattern = "Culture Negative", ignore.case = T)) %>%
    pivot_longer(-c(patientID:micro1.factor), names_to = "organisms",
        values_to = "org_presence") %>%
    mutate(organisms = ifelse(bact_infection_present == "No",
        "No Bacterial Infection", organisms), org_presence = ifelse(org_presence ==
        TRUE, 1, 0)) %>%
    group_by(sampleID, infx_stool, bact_infection_present, organisms) %>%
    dplyr::slice_max(org_presence) %>%
    ungroup() %>%
    filter(org_presence == 1) %>%
    left_join(metaphlan_peri_anon %>%
        select(-bact_infection_present) %>%
        group_by(patientID, sampleID) %>%
        filter(grepl(x = Species, pattern = "Enterococcus", ignore.case = T)) %>%
        count(sampleID, wt = pctseqs, name = "enterococcus_rel_abundance")) %>%
    left_join(metaphlan_peri_anon %>%
        select(patientID, sampleID, Kingdom:pctseqs)) %>%
    group_by(patientID, sampleID, organisms, org_presence) %>%
    dplyr::slice(1) %>%
    mutate(sampleID = ifelse(is.na(sampleID), patientID, sampleID)) %>%
    mutate(org_presence = ifelse(grepl(pattern = "No", x = bact_infection_present,
        ignore.case = T), 0, org_presence)) %>%
    ungroup() %>%
    mutate(organisms = ifelse(bact_infection_present == "Yes" &
        org_presence == 0, "Other", organisms)) %>%
    arrange(-enterococcus_rel_abundance) %>%
    mutate(organisms = ifelse(grepl(x = organisms, pattern = "enterococcus|klebsiella|escherichia|proteus|citrobacter|culture",
        ignore.case = TRUE), organisms, "Other Bacterial Infection"),
        organisms = ifelse(bact_infection_present == "No", "No Bacterial Infection",
            organisms))

ecoc_infx_orgs_order <- ecoc_infx_orgs %>%
    group_by(sampleID, patientID, organisms) %>%
    distinct(sampleID, patientID, .keep_all = T) %>%
    ungroup() %>%
    mutate(organisms = ifelse(grepl(organisms, pattern = "gallinarum"),
        "Other Bacterial Infection", organisms), org_colors = case_when(grepl(x = organisms,
        pattern = "enterococcus faecium", ignore.case = T) ~
        1, grepl(x = organisms, pattern = "enterococcus faecalis",
        ignore.case = T) ~ 2, grepl(x = organisms, pattern = "enterococcus avium",
        ignore.case = T) ~ 3, grepl(x = organisms, pattern = "klebsiella pneumoniae",
        ignore.case = T) ~ 4, grepl(x = organisms, pattern = "escherichia coli",
        ignore.case = T) ~ 5, grepl(x = organisms, pattern = "proteus mirabilis",
        ignore.case = T) ~ 6, grepl(x = organisms, pattern = "citrobacter freundii",
        ignore.case = T) ~ 7, grepl(x = organisms, pattern = "other bacterial infection|gallinarum",
        ignore.case = T) ~ 8, grepl(x = organisms, pattern = "culture negative",
        ignore.case = T) ~ 9, TRUE ~ 0), organisms = as.factor(organisms),
        organisms = factor(organisms, levels = c("Enterococcus faecium",
            "Enterococcus faecalis", "Enterococcus avium", "Klebsiella pneumoniae",
            "Escherichia coli", "Proteus mirabilis", "Citrobacter freundii",
            "Other Bacterial Infection", "Culture Negative",
            "No Bacterial Infection")), org_colors = factor(org_colors,
            levels = c("1", "2", "3", "4", "5", "6", "7", "8",
                "9", "0"))) %>%
    left_join(ecoc_infx_orgs) %>%
    mutate(organisms = factor(organisms, levels = c("Enterococcus faecium",
        "Enterococcus faecalis", "Enterococcus avium", "Klebsiella pneumoniae",
        "Escherichia coli", "Proteus mirabilis", "Citrobacter freundii",
        "Other Bacterial Infection", "Culture Negative", "No Bacterial Infection")),
        org_colors = factor(org_colors, levels = c("1", "2",
            "3", "4", "5", "6", "7", "8", "9", "0"))) %>%
    drop_na(organisms) %>%
    mutate(bact_infection_present = ifelse(bact_infection_present ==
        "No", "No Infection", "Infection"), bact_infection_present = factor(bact_infection_present,
        levels = c("Infection", "No Infection"))) %>%
    ungroup() %>%
    filter(bact_infection_present == "Infection") %>%
    droplevels()

gg_ecoc_infx_orgs <- ecoc_infx_orgs_order %>%
    ggplot(., aes(x = reorder(sampleID, -enterococcus_rel_abundance),
        y = organisms, fill = as.factor(org_colors))) + geom_tile(color = "black") +
    theme_bw() + theme(axis.title.y = et(color = "black", size = 12),
    axis.text.y = et(color = "black", size = 10), axis.ticks.x = eb(),
    axis.text.x = eb(), axis.title.x = eb(), panel.grid = eb(),
    strip.text = eb()) + scale_fill_manual(values = c("#129246",
    "#0C7A3A", "#08592B", "#FF0000", "#CC0404", "#8A0202", "#5C0202",
    "#E6C66E", "#BD992D", "#00000000"), labels = c("Enterococcus faecium",
    "Enterococcus faecalis", "Enterococcus avium", "Klebsiella pneumoniae",
    "Escherichia coli", "Citrobacter freundii", "Proteus mirabilis",
    "Other Bacterial Infection", "Culture Negative", "No Bacterial Infection")) +
    labs(fill = "Infecting Organism", y = "Infecting Organism\n") +
    scale_y_discrete(expand = expansion(mult = c(0.005, 0.005)),
        limits = rev(levels(ecoc_infx_orgs_order$organisms))) +
    facet_grid(. ~ bact_infection_present, space = "free_x",
        scales = "free_x")

# gg_ecoc_infx_orgs

yingtools2::gg.stack(gg_ecoc_all_patients, gg_ecoc_infx_orgs,
    heights = c(1, 0.5), adjust.themes = T)

pdf(file = "./Results/Figure_5A.pdf", height = 8, width = 10,
    onefile = F)

yingtools2::gg.stack(gg_ecoc_all_patients, gg_ecoc_infx_orgs,
    heights = c(1, 0.5), adjust.themes = TRUE)

invisible(dev.off())

# Save object for combining later
fig_5a <- yingtools2::gg.stack(gg_ecoc_all_patients + theme(plot.margin = margin(t = 5,
    r = 5, b = 5, l = 5)), gg_ecoc_infx_orgs + theme(legend.position = "none",
    plot.margin = margin(t = 5, r = 5, b = 5, l = 5)), heights = c(1,
    0.5), adjust.themes = TRUE, as.gtable = TRUE)

Figure 5C: Enterobacterales Expansion and Infection

gg_ebac_all_patients <- peri_matrix_all %>%
    distinct(sampleID, .keep_all = T) %>%
    select(sampleID, patientID, bact_infection_present) %>%
    mutate(bact_infection_present = ifelse(bact_infection_present ==
        "No", "No Infection", "Infection"), bact_infection_present = factor(bact_infection_present,
        levels = c("Infection", "No Infection"))) %>%
    left_join(metaphlan_peri_anon %>%
        select(-bact_infection_present) %>%
        group_by(patientID, sampleID) %>%
        filter(grepl(x = Species, pattern = "Klebsiella pneumoniae|Escherichia coli|Citrobacter freundii|Proteus mirabilis")) %>%
        count(sampleID, wt = pctseqs, name = "enterobacterales_rel_abundance")) %>%
    left_join(metaphlan_peri_anon %>%
        select(patientID, sampleID, Kingdom:pctseqs)) %>%
    ungroup() %>%
    arrange(Kingdom, Phylum, Class, Order, Family, Genus) %>%
    mutate(Genus = paste0(Phylum, "-", Order, "-", Family, "-",
        Genus)) %>%
    group_by(sampleID) %>%
    arrange(Genus) %>%
    mutate(cum.pct = cumsum(pctseqs), y.text = (cum.pct + c(0,
        cum.pct[-length(cum.pct)]))/2) %>%
    ungroup() %>%
    dplyr::select(-cum.pct) %>%
    mutate(sampleID = ifelse(is.na(sampleID), patientID, sampleID),
        Genus = factor(Genus, levels = unique(Genus)), Genus = fct_relevel(Genus,
            c("Proteobacteria-Enterobacterales-Enterobacteriaceae-Citrobacter",
                "Proteobacteria-Enterobacterales-Enterobacteriaceae-Enterobacter",
                "Proteobacteria-Enterobacterales-Enterobacteriaceae-Escherichia",
                "Proteobacteria-Enterobacterales-Enterobacteriaceae-Klebsiella",
                "Proteobacteria-Enterobacterales-Morganellaceae-Proteus"),
            after = Inf)) %>%
    arrange(-enterobacterales_rel_abundance) %>%
    filter(bact_infection_present == "Infection") %>%
    ggplot(aes(x = reorder(sampleID, -enterobacterales_rel_abundance),
        y = pctseqs)) + geom_bar(aes(fill = Genus), stat = "identity") +
    theme_bw() + theme(legend.position = "none", axis.text.x = et(angle = 90,
    hjust = 0.5), strip.text.x = eb(), strip.background = eb(),
    axis.title.y = et(color = "black", size = 12), axis.text.y = et(color = "black",
        size = 10)) + scale_fill_manual(values = metaphlan_pal2) +
    scale_y_continuous(expand = expansion(mult = c(0.005, 0.005))) +
    ylab("Shotgun Relative Abundance (%)\n") + xlab("") + facet_grid(~bact_infection_present,
    space = "free_x", scales = "free_x")
# gg_ebac_all_patients

# Discrete heatmap of infections
ebac_infx_orgs <- peri_criteria_all %>%
    filter(sampleID %in% peri_matrix_all$sampleID) %>%
    group_by(patientID, eday) %>%
    arrange(-infx_stool) %>%
    dplyr::slice(1) %>%
    ungroup() %>%
    select(patientID, sampleID, bact_infection_present, infx_stool,
        organism1, micro1.factor) %>%
    distinct() %>%
    mutate(organism1 = gsub(x = organism1, pattern = "\\s+",
        replacement = ""), organism1 = str_to_lower(string = organism1),
        organism1 = ifelse(grepl(x = organism1, pattern = "enterococcus|enterobacterales|klebsiella|escherichia|citrobacter|proteus|staphyl|clostrid|pseudo|steno|bacteroides|helico"),
            organism1, "Culture Negative")) %>%
    group_by(patientID, sampleID, infx_stool) %>%
    mutate(`Enterococcus faecium` = grepl(x = organism1, pattern = "enterococcusfaecium",
        ignore.case = T), `Enterococcus faecalis` = grepl(x = organism1,
        pattern = "enterococcusfaecalis", ignore.case = T), `Enterococcus avium` = grepl(x = organism1,
        pattern = "enterococcusavium", ignore.case = T), `Enterococcus gallinarum` = grepl(x = organism1,
        pattern = "enterococcusgallinarum", ignore.case = T),
        `Klebsiella pneumoniae` = grepl(x = organism1, pattern = "klebsiellapneumoniae",
            ignore.case = T), `Enterobacter cloaceae` = grepl(x = organism1,
            pattern = "enterobactercloaceae", ignore.case = T),
        `Escherichia coli` = grepl(x = organism1, pattern = "escherichiacoli",
            ignore.case = T), `Citrobacter freundii` = grepl(x = organism1,
            pattern = "citrobacterfreundii", ignore.case = T),
        `Proteus mirabilis` = grepl(x = organism1, pattern = "proteusmirabilis",
            ignore.case = T), `Staphylococcus aureus` = grepl(x = organism1,
            pattern = "staphylococcusaureus", ignore.case = T),
        `Staphylococcus epidermis` = grepl(x = organism1, pattern = "staphylococcusepidermidis",
            ignore.case = T), `Pseudomonas aeruginosa` = grepl(x = organism1,
            pattern = "pseudomonasaeruginosa", ignore.case = T),
        `Stenotrophmonas maltophilia` = grepl(x = organism1,
            pattern = "stenotrophmonasmaltophilia", ignore.case = T),
        `Helicobacter pylori` = grepl(x = organism1, pattern = "helicobacterpylori",
            ignore.case = T), `Clostridium difficile` = grepl(x = organism1,
            pattern = "clostridiumdifficile|clostridioidesdifficile",
            ignore.case = T), `Bacteroides sp.` = grepl(x = organism1,
            pattern = "bacteroides", ignore.case = T), `Culture Negative` = grepl(x = organism1,
            pattern = "Culture Negative", ignore.case = T)) %>%
    pivot_longer(-c(patientID:micro1.factor), names_to = "organisms",
        values_to = "org_presence") %>%
    mutate(organisms = ifelse(bact_infection_present == "No",
        "No Bacterial Infection", organisms), org_presence = ifelse(org_presence ==
        TRUE, 1, 0)) %>%
    group_by(sampleID, infx_stool, bact_infection_present, organisms) %>%
    dplyr::slice_max(org_presence) %>%
    ungroup() %>%
    filter(org_presence == 1) %>%
    left_join(metaphlan_peri_anon %>%
        select(-bact_infection_present) %>%
        group_by(patientID, sampleID) %>%
        filter(grepl(x = Species, pattern = "Klebsiella pneumoniae|Escherichia coli|Citrobacter freundii|Proteus mirabilis")) %>%
        count(sampleID, wt = pctseqs, name = "enterobacterales_rel_abundance")) %>%
    left_join(metaphlan_peri_anon %>%
        select(patientID, sampleID, Kingdom:pctseqs)) %>%
    group_by(patientID, sampleID, organisms, org_presence) %>%
    dplyr::slice(1) %>%
    mutate(sampleID = ifelse(is.na(sampleID), patientID, sampleID)) %>%
    mutate(org_presence = ifelse(grepl(pattern = "No", x = bact_infection_present,
        ignore.case = T), 0, org_presence)) %>%
    ungroup() %>%
    mutate(organisms = ifelse(bact_infection_present == "Yes" &
        org_presence == 0, "Other", organisms)) %>%
    arrange(-enterobacterales_rel_abundance) %>%
    mutate(organisms = ifelse(grepl(x = organisms, pattern = "enterococcus|klebsiella|escherichia|proteus|citrobacter|culture",
        ignore.case = TRUE), organisms, "Other Bacterial Infection"),
        organisms = ifelse(bact_infection_present == "No", "No Bacterial Infection",
            organisms))

ebac_infx_orgs_order <- ebac_infx_orgs %>%
    group_by(sampleID, patientID, organisms) %>%
    distinct(sampleID, patientID, .keep_all = T) %>%
    ungroup() %>%
    mutate(organisms = ifelse(grepl(organisms, pattern = "gallinarum"),
        "Other Bacterial Infection", organisms), org_colors = case_when(grepl(x = organisms,
        pattern = "enterococcus faecium", ignore.case = T) ~
        1, grepl(x = organisms, pattern = "enterococcus faecalis",
        ignore.case = T) ~ 2, grepl(x = organisms, pattern = "enterococcus avium",
        ignore.case = T) ~ 3, grepl(x = organisms, pattern = "klebsiella pneumoniae",
        ignore.case = T) ~ 4, grepl(x = organisms, pattern = "escherichia coli",
        ignore.case = T) ~ 5, grepl(x = organisms, pattern = "proteus mirabilis",
        ignore.case = T) ~ 6, grepl(x = organisms, pattern = "citrobacter freundii",
        ignore.case = T) ~ 7, grepl(x = organisms, pattern = "other bacterial infection|gallinarum",
        ignore.case = T) ~ 8, grepl(x = organisms, pattern = "culture negative",
        ignore.case = T) ~ 9, TRUE ~ 0), organisms = as.factor(organisms),
        organisms = factor(organisms, levels = c("Klebsiella pneumoniae",
            "Escherichia coli", "Proteus mirabilis", "Citrobacter freundii",
            "Enterococcus faecium", "Enterococcus faecalis",
            "Enterococcus avium", "Other Bacterial Infection",
            "Culture Negative", "No Bacterial Infection")), org_colors = factor(org_colors,
            levels = c("4", "5", "6", "7", "1", "2", "3", "8",
                "9", "0"))) %>%
    left_join(ebac_infx_orgs) %>%
    mutate(organisms = factor(organisms, levels = c("Klebsiella pneumoniae",
        "Escherichia coli", "Proteus mirabilis", "Citrobacter freundii",
        "Enterococcus faecium", "Enterococcus faecalis", "Enterococcus avium",
        "Other Bacterial Infection", "Culture Negative", "No Bacterial Infection")),
        org_colors = factor(org_colors, levels = c("4", "5",
            "6", "7", "1", "2", "3", "8", "9", "0"))) %>%
    drop_na(organisms) %>%
    mutate(bact_infection_present = ifelse(bact_infection_present ==
        "No", "No Infection", "Infection"), bact_infection_present = factor(bact_infection_present,
        levels = c("Infection", "No Infection"))) %>%
    ungroup() %>%
    filter(bact_infection_present == "Infection") %>%
    droplevels()

gg_ebac_infx_orgs <- ebac_infx_orgs_order %>%
    ggplot(., aes(x = reorder(sampleID, -enterobacterales_rel_abundance),
        y = organisms, fill = as.factor(org_colors))) + geom_tile(color = "black") +
    theme_bw() + theme(axis.title.y = et(color = "black", size = 12),
    axis.text.y = et(color = "black", size = 10), axis.ticks.x = eb(),
    axis.text.x = eb(), axis.title.x = eb(), panel.grid = eb(),
    strip.text = eb()) + scale_fill_manual(values = c("#FF0000",
    "#CC0404", "#8A0202", "#5C0202", "#129246", "#0C7A3A", "#08592B",
    "#E6C66E", "#BD992D", "#00000000"), labels = c("Klebsiella pneumoniae",
    "Escherichia coli", "Citrobacter freundii", "Proteus mirabilis",
    "Enterococcus faecium", "Enterococcus faecalis", "Enterococcus avium",
    "Other Bacterial Infection", "Culture Negative", "No Bacterial Infection")) +
    labs(fill = "Infecting Organism", y = "Infecting Organism\n") +
    scale_y_discrete(expand = expansion(mult = c(0.005, 0.005)),
        limits = rev(levels(ebac_infx_orgs_order$organisms))) +
    facet_grid(. ~ bact_infection_present, space = "free_x",
        scales = "free_x")

# gg_ebac_infx_orgs

yingtools2::gg.stack(gg_ebac_all_patients, gg_ebac_infx_orgs,
    heights = c(1, 0.5), adjust.themes = T)

pdf(file = "./Results/Figure_5C.pdf", height = 8, width = 10,
    onefile = F)

yingtools2::gg.stack(gg_ebac_all_patients, gg_ebac_infx_orgs,
    heights = c(1, 0.5), adjust.themes = T)

invisible(dev.off())

# Save object for combining later
fig_5c <- yingtools2::gg.stack(gg_ebac_all_patients + theme(plot.margin = margin(t = 5,
    r = 5, b = 5, l = 5)), gg_ebac_infx_orgs + theme(legend.position = "none",
    plot.margin = margin(t = 5, r = 5, b = 5, l = 5)), heights = c(1,
    0.5), adjust.themes = TRUE, as.gtable = TRUE)

Combine Figure 5A+B+C+D

pdf(file = "./Results/Figure_5ABCD.pdf", height = 16, width = 16,
    onefile = F)
cowplot::plot_grid(fig_5a, fig_5b + theme(plot.margin = margin(t = 5,
    r = 5, b = 25, l = 5)), fig_5c, fig_5d + theme(plot.margin = margin(t = 5,
    r = 5, b = 25, l = 5)), align = "h", axis = "bl", ncol = 2,
    rel_widths = c(1, 0.9))
invisible(dev.off())

Quant Metabolomics-Enterococcus Expansion

# Qual compounds that we can quantify and that are also
# significant in the volcano analysis for both
signif_compounds <- qual_tot_ecoc_expan %>%
    rownames_to_column(var = "compound") %>%
    filter(p.adj <= 0.05 & abs(log2fc_val) > 0.75) %>%
    distinct(compound)

metab_boxplot_ecoc_expan <- peri_matrix_all %>%
    select(sampleID, enterococcus_rel_abundance) %>%
    left_join(sample_lookup) %>%
    group_by(sampleID) %>%
    slice(1) %>%
    left_join(metab_quant_converted %>%
        filter(compound %in% signif_compounds$compound)) %>%
    filter(compound %!in% c("Kynurenine", "Anthranilic Acid")) %>%
    mutate(compound = ifelse(compound == "isovaleric-acid", "isovalerate",
        compound), compound = str_to_title(compound)) %>%
    ungroup() %>%
    mutate(db = ifelse(db == "HealthyDonor", "Healthy Donor",
        "Liver Transplant") %>%
        factor(., levels = c("Healthy Donor", "Liver Transplant")),
        class = case_when(compound %in% c("Taurocholic Acid",
            "Glycocholic Acid") ~ "Conjugated Primary Bile Acid",
            compound %in% c("Cholic Acid") ~ "Primary Bile Acid",
            compound %in% c("3-Oxolithocholic Acid", "Alloisolithocholic Acid",
                "Deoxycholic Acid", "Isodeoxycholic Acid", "Lithocholic Acid") ~
                "Secondary Bile Acid", compound %in% c("Threonine",
                "Glycine", "Tyrosine", "Tyramine", "Serine",
                "Leucine", "Isoleucine", "Valine", "Phenylalanine",
                "Alanine", "Proline", "Aspartate", "Methionine",
                "Glutamate", "Lysine", "Cysteine", "Tryptophan") ~
                "Amino Acid", compound %in% c("Acetate", "Butyrate",
                "Succinate", "Propionate") ~ "Short-Chain Fatty Acid",
            compound %in% c("Kynurenic Acid", "Anthranilic Acid",
                "Kynurenine", "Tryptamine") ~ "Kynurenine Metabolite",
            compound == "Desaminotyrosine" ~ "Phenolic Aromatic",
            compound == "Niacin" ~ "B-Vitamin", TRUE ~ "Indole")) %>%
    drop_na() %>%
    group_by(compound) %>%
    mutate(class = factor(class, levels = c("Conjugated Primary Bile Acid",
        "Primary Bile Acid", "Secondary Bile Acid", "Short-Chain Fatty Acid",
        "Amino Acid", "Phenolic Aromatic", "Indole", "Kynurenine Metabolite",
        "B-Vitamin")), compound = case_when(class == "Conjugated Primary Bile Acid" ~
        paste(compound, "(1ËšConj. BA)"), class == "Primary Bile Acid" ~
        paste(compound, "(1Ëš BA)"), class == "Secondary Bile Acid" ~
        paste(compound, "(2Ëš BA)"), class == "Short-Chain Fatty Acid" ~
        paste(compound, "(SCFA)"), class == "Amino Acid" ~ paste(compound,
        "(AA)"), class == "Phenolic Aromatic" ~ paste(compound,
        "(Phen. Arom.)"), class == "Indole" ~ paste0(compound,
        "(Indole)"), class == "Kynurenine Metabolite" ~ paste(compound,
        "(Kyn. Metab.)"), class == "B-Vitamin" ~ paste(compound,
        "(B-Vitamin)")), compound = factor(compound, levels = c("Acetate (SCFA)",
        "Butyrate (SCFA)", "Propionate (SCFA)", "Succinate (SCFA)",
        "Taurocholic Acid (1ËšConj. BA)", "Glycocholic Acid (1ËšConj. BA)",
        "Cholic Acid (1Ëš BA)", "3-Oxolithocholic Acid (2Ëš BA)",
        "Alloisolithocholic Acid (2Ëš BA)", "Deoxycholic Acid (2Ëš BA)",
        "Isodeoxycholic Acid (2Ëš BA)", "Lithocholic Acid (2Ëš BA)",
        "Kynurenic Acid (Kyn. Metab.)", "Kynurenine (Kyn. Metab.)",
        "Anthranilic Acid (Kyn. Metab.)", "Desaminotyrosine",
        "Niacin", "Tyrosine", "Tryptamine", "Tryptophan", "Phenylalanine"))) %>%
    ungroup() %>%
    drop_na() %>%
    mutate(enterococcus_expansion = ifelse(enterococcus_rel_abundance >=
        optimal_cutpoint_rel$optimal_cutpoint[2], 1, 0)) %>%
    arrange(enterococcus_expansion, sampleID) %>%
    group_by(compound) %>%
    mutate(enterococcus_expansion_0 = length(mvalue__mM[enterococcus_expansion ==
        "0"]), enterococcus_expansion_1 = length(mvalue__mM[enterococcus_expansion ==
        "1"])) %>%
    filter(any(mvalue__mM != 0)) %>%
    mutate(enterococcus_expansion = ifelse(enterococcus_expansion ==
        "1", "Expansion", "No Expansion"))


metab_boxplot_summary_stats_ecoc_expan <- metab_boxplot_ecoc_expan %>%
    group_by(compound) %>%
    summarise(y.position = max(mvalue__mM) * 1.1)

metab_boxplot_stats_ecoc_expan <- metab_boxplot_ecoc_expan %>%
    group_by(class, compound) %>%
    rstatix::wilcox_test(mvalue__mM ~ enterococcus_expansion) %>%
    rstatix::adjust_pvalue(method = "BH") %>%
    rstatix::add_significance("p.adj") %>%
    mutate(p.adj = ifelse(p.adj < 0.001, "p.adj < 0.001", paste("p.adj = ",
        round(p.adj, 3)))) %>%
    add_xy_position(x = "enterococcus_expansion") %>%
    select(-y.position) %>%
    left_join(metab_boxplot_summary_stats_ecoc_expan)


# gg_metab_boxplot_ecoc_expan <-
# ggboxplot(metab_boxplot_ecoc_expan, x =
# 'enterococcus_expansion', y = 'mvalue__mM', fill =
# 'enterococcus_expansion', alpha = 0.65, outlier.shape =
# NA ) + theme(legend.text = et(size = 14, color =
# 'black'), legend.title = et(size = 14, color = 'black'),
# axis.title.x = eb(), axis.title.y = et(size = 16, color =
# 'black'), strip.text = et(size = 14, color = 'black'),
# strip.background = eb(), panel.border = eb(), axis.line =
# eb()) + facet_wrap(~compound, scales = 'free_y', nrow =
# 2) + annotate('segment', x = 0.35, xend = 0.35, y = 0,
# yend = Inf, colour = 'black', linewidth = 1) +
# annotate('segment', x = 0.35, xend = Inf, y = 0, yend =
# 0, colour = 'black') +
# stat_pvalue_manual(metab_boxplot_stats_ecoc_expan, label
# = '{p.adj}', bracket.size = 0.5) + geom_point(data =
# metab_boxplot_ecoc_expan, aes(x = enterococcus_expansion,
# y = mvalue__mM, fill = enterococcus_expansion), position
# = position_jitter(width = 0.2), size = 2, shape = 21,
# alpha = 0.65, color = 'black') +
# scale_fill_manual('Enterococcus', values = c('gray87',
# '#389458')) + scale_y_continuous(expand = expansion(mult
# = c(0.1, 0.2))) + ylab('Concentration (mM)\n')
# gg_metab_boxplot_ecoc_expan cairo_pdf(filename =
# './Results/Figure_5B.pdf', height = 8, width = 16,
# onefile = TRUE) gg_metab_boxplot_ecoc_expan
# invisible(dev.off())

Figure 3A: sPLS-DA Diversity Groups and Healthy Donors using Qualitative Metabolomics

diversity_metab_mat <-
  metab_qual_anon %>% filter(sampleID %in% first_samps$sampleID) %>% 
  mutate(compound = ifelse(compound == "isovaleric-acid", "isovalerate", compound),
         compound = str_to_title(compound)) %>%
  filter(compound %in% heatmap_cmpds$compound|is.na(compound)) %>%
  ungroup() %>%
  left_join(metaphlan_df_sumry %>% select(sampleID, diversity_group_abv)) %>% 
  group_by(sampleID, compound, diversity_group_abv) %>% 
  summarise(mvalue = mean(mvalue, na.rm = TRUE)) %>% 
  ungroup() %>%
  mutate_all(~replace(., is.nan(.), NA)) %>% 
  select(sampleID, compound, mvalue, diversity_group_abv) %>% 
  drop_na(sampleID) %>% 
  pivot_wider(names_from = "compound", values_from = "mvalue") %>% 
  filter(sampleID != "") %>%
  column_to_rownames(var = "sampleID") %>% 
  select(-`NA`, - diversity_group_abv) %>% 
  filter_all(any_vars(!is.na(.)))

diversity_metab_labs <-
  diversity_metab_mat %>% 
  rownames_to_column(var = "sampleID") %>% 
  left_join(metaphlan_df_sumry %>% select(sampleID, diversity_group_abv)) %>% 
  droplevels() %>% 
  pull(diversity_group_abv)

dim(diversity_metab_mat) #101 93 (means 93 compounds and 101 LT patients/infections)
## [1] 101  93
length(diversity_metab_labs) #101 (means 101 LT patients/infections)
## [1] 101
# Begin model training
set.seed(1234)
diversity_train <- sample(1:nrow(diversity_metab_mat), as.integer(0.7*nrow(diversity_metab_mat))) # randomly select 70% samples in training
diversity_test <- setdiff(1:nrow(diversity_metab_mat), diversity_train) # rest is part of the test set

# store matrices into training and test set:
diversity_metab_mat.train <- diversity_metab_mat[diversity_train, ]
diversity_metab_mat.test <- diversity_metab_mat[diversity_test,]
diversity_metab_labs.train <- diversity_metab_labs[diversity_train]
diversity_metab_labs.test <- diversity_metab_labs[diversity_test]

# Train the model to tune hyperparameters
# Initial model to find optimal number of components to include
set.seed(1234)
diversity_train_splsda <- mixOmics::splsda(diversity_metab_mat.train, diversity_metab_labs.train, ncomp = 5)

# Performance assessment
## 5-fold, 50-repeat cross validation
set.seed(1234)
diversity_train_plsda_perf <-
  perf(
    diversity_train_splsda,
    validation = "Mfold",
    folds = 5,
    progressBar = FALSE,
    auc = TRUE,
    nrepeat = 50
  )

plot(
  diversity_train_plsda_perf,
  col = color.mixo(5:7),
  sd = FALSE,
  auc = TRUE,
  legend.position = "horizontal"
) # ncomp = 4 might be best for classification error rate and max.dist

# Number of optimal variables to select for each component
diversity_train_keepX <- c(1:10,  seq(20, 130, 10))

set.seed(123)
diversity_train_tune_splsda <-
  mixOmics::tune.splsda(
    diversity_metab_mat.train,
    diversity_metab_labs.train,
    ncomp = 4, # Choose 4 components (max) to be safe
    validation = 'Mfold',
    folds = 5,
    dist = 'max.dist',
    progressBar = FALSE,
    auc = TRUE,
    measure = "BER",
    test.keepX = diversity_train_keepX,
    nrepeat = 50
  )

plot(diversity_train_tune_splsda, col = color.jet(4))

diversity_train_error <- diversity_train_tune_splsda$error.rate
diversity_train_ncomp <- diversity_train_tune_splsda$choice.ncomp$ncomp # optimal number of components based on t-tests on the error rate
diversity_train_ncomp #1 component is optimal
## [1] 1
diversity_train_select_keepX <- diversity_train_tune_splsda$choice.keepX[1:ifelse(diversity_train_ncomp == 1, diversity_train_ncomp + 1, diversity_train_ncomp)]  # optimal number of variables to select per component
diversity_train_select_keepX
## comp1 comp2 
##    80     8
# Final Model
diversity_train_splsda_final <-
  mixOmics::splsda(diversity_metab_mat.train, diversity_metab_labs.train, ncomp = ifelse(diversity_train_ncomp == 1, diversity_train_ncomp + 1, diversity_train_ncomp), keepX = diversity_train_select_keepX)

# Test the model
diversity_predict_train_splsda_final <- predict(diversity_train_splsda_final, diversity_metab_mat.test, 
                                dist = "max.dist")

diversity_predict_train_comp2 <- diversity_predict_train_splsda_final$class$max.dist[,ifelse(diversity_train_ncomp == 1, diversity_train_ncomp + 1, diversity_train_ncomp)]

diversity_union <- union(diversity_predict_train_comp2, diversity_metab_labs.test)

confusionMatrix(table(factor(diversity_predict_train_comp2, diversity_union,
                             levels = c("Low", "Medium", "High")),
                      factor(diversity_metab_labs.test, diversity_union,
                             levels = c("Low", "Medium", "High"))))
## Confusion Matrix and Statistics
## 
##         
##          Medium High Low
##   Medium     10    4   0
##   High        3    6   3
##   Low         0    0   5
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6774          
##                  95% CI : (0.4863, 0.8332)
##     No Information Rate : 0.4194          
##     P-Value [Acc > NIR] : 0.003321        
##                                           
##                   Kappa : 0.4992          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Medium Class: High Class: Low
## Sensitivity                 0.7692      0.6000     0.6250
## Specificity                 0.7778      0.7143     1.0000
## Pos Pred Value              0.7143      0.5000     1.0000
## Neg Pred Value              0.8235      0.7895     0.8846
## Prevalence                  0.4194      0.3226     0.2581
## Detection Rate              0.3226      0.1935     0.1613
## Detection Prevalence        0.4516      0.3871     0.1613
## Balanced Accuracy           0.7735      0.6571     0.8125
diversity_train_background <- background.predict(diversity_train_splsda_final,
                                            comp.predicted = 2,
                                            xlim = c(-20,20),
                                            ylim = c(-20,20),
                                            dist = "centroids.dist") 

# Model metrics for all samples
diversity_tot <- predict(diversity_train_splsda_final,
                        diversity_metab_mat,
                        dist = "max.dist")

diversity_tot_predict <- diversity_tot$class$max.dist[,ifelse(diversity_train_ncomp == 1, diversity_train_ncomp + 1, diversity_train_ncomp)]

diversity_tot_union <- union(diversity_tot_predict, diversity_metab_labs)

diversity_cm <- confusionMatrix(table(factor(diversity_tot_predict, diversity_tot_union,
                                        levels = c("Low", "Medium", "High")),
                      factor(diversity_metab_labs, diversity_tot_union,
                             levels = c("Low", "Medium", "High"))),
                        )
diversity_cm
## Confusion Matrix and Statistics
## 
##         
##          Medium Low High
##   Medium     30  11    2
##   Low         6  27    6
##   High        0   1   18
## 
## Overall Statistics
##                                          
##                Accuracy : 0.7426         
##                  95% CI : (0.646, 0.8244)
##     No Information Rate : 0.3861         
##     P-Value [Acc > NIR] : 3.744e-13      
##                                          
##                   Kappa : 0.6044         
##                                          
##  Mcnemar's Test P-Value : 0.07057        
## 
## Statistics by Class:
## 
##                      Class: Medium Class: Low Class: High
## Sensitivity                 0.8333     0.6923      0.6923
## Specificity                 0.8000     0.8065      0.9867
## Pos Pred Value              0.6977     0.6923      0.9474
## Neg Pred Value              0.8966     0.8065      0.9024
## Prevalence                  0.3564     0.3861      0.2574
## Detection Rate              0.2970     0.2673      0.1782
## Detection Prevalence        0.4257     0.3861      0.1881
## Balanced Accuracy           0.8167     0.7494      0.8395
# Additional model measures
diversity_epi <- mltest::ml_test(predicted = factor(diversity_tot_predict, levels = c("Low", "Medium", "High")),
                                 true = factor(diversity_metab_labs, levels = c("Low", "Medium", "High")))

diversity_cm_names <- diversity_cm$table
colnames(diversity_cm_names) <- c("Actual\nLow", "Actual\nMedium", "Actual\nHigh")
rownames(diversity_cm_names) <- c("Predicted\nLow", "Predicted\nMedium", "Predicted\nHigh")

diversity_confusion_df <- diversity_cm_names %>% 
  t() 

{
pdf(file = "./Results/Figure_3A.pdf", height = 10, width = 10)
plotIndiv(
  diversity_train_splsda_final,
  xlim = c(min(diversity_train_splsda_final$variates$X[,1])*1.05, max(diversity_train_splsda_final$variates$X[,1])*1.8),
  ylim = c(min(diversity_train_splsda_final$variates$X[,2])*1.15, max(diversity_train_splsda_final$variates$X[,2])*1.8),
  comp = c(1,2),
  pch = 1,
  ind.names = FALSE,
  legend = FALSE,
  background = diversity_train_background,
  col = c("#3A001E", "#8A0246", "#C20463"),
  star = TRUE,
  point.lwd = 0.5,
  title = NULL,
  size.title = 0.00001,
  style = "graphics",
  legend.title = "Diversity Group",
  X.label = paste0("Component 1 (", round(diversity_train_splsda_final$prop_expl_var$X[1] * 100), "%)"),
  Y.label = paste0("Component 2 (", round(diversity_train_splsda_final$prop_expl_var$X[2] * 100), "%)")
) 
addtable2plot(
  y = -5.25,
  x = min(diversity_train_splsda_final$variates$X[,1])*1.1,
  diversity_confusion_df,
  bty = "o",
  display.rownames = TRUE,
  hlines = TRUE,
  vlines = TRUE,
  cex = 0.75,
  bg = "white"
)
text(
  y = -5.5,
  x = min(diversity_train_splsda_final$variates$X[,1])*1.1,
  paste0("Overall ACC = ", 
                    paste0(formatC(round(diversity_cm$overall[1], 3)*100, digits = 1, format = "f"), "%"), " [",
                    paste0(formatC(round(diversity_cm$overall[3], 3)*100, digits = 1, format = "f"), "%"),", ",
                    paste0(formatC(round(diversity_cm$overall[4], 3)*100, digits = 1, format = "f"), "%"),"]"),
    cex = 0.75, adj = 0)

text(
  y = 3,
  x = 1.5,
  paste0("Low Diversity ACC = ", round(diversity_epi$balanced.accuracy[1]*100, 1), "%"),
    cex = 0.75, adj = 0, col = "#3A001E")
text(
  y = 2.7,
  x = 1.5,
  paste0("Low Diversity Sens. = ", round(diversity_epi$precision[1]*100, 1), "%"),
    cex = 0.75, adj = 0, col = "#3A001E")
text(
  y = 2.4,
  x = 1.5,
  paste0("Low Diversity Spec. = ", round(diversity_epi$specificity[1]*100, 1), "%"),
    cex = 0.75, adj = 0, col = "#3A001E")
# text(
#   y = 2.1,
#   x = 1.5,
#   paste0("Low Diversity OR = ", round(diversity_epi$DOR[1], 1)),
#     cex = 0.75, adj = 0, col = "#3A001E")

text(
  y = -4.7,
  x = 0,
  paste0("Medium Diversity ACC = ", round(diversity_epi$balanced.accuracy[2]*100, 1), "%"),
    cex = 0.75, adj = 0, col = "#8A0246")
text(
  y = -5,
  x = 0,
  paste0("Medium Diversity Sens. = ", round(diversity_epi$precision[2]*100, 1), "%"),
    cex = 0.75, adj = 0, col = "#8A0246")
text(
  y = -5.3,
  x = 0,
  paste0("Medium Diversity Spec. = ", round(diversity_epi$specificity[2]*100, 1), "%"),
    cex = 0.75, adj = 0, col = "#8A0246")
# text(
#   y = -5.9,
#   x = 1.5,
#   paste0("Medium Diversity OR = ", round(diversity_epi$DOR[2], 1)),
#     cex = 0.75, adj = 0, col = "#8A0246")

text(
  y = 3,
  x = min(diversity_train_splsda_final$variates$X[,1])*1.05,
  paste0("High Diversity ACC = ", round(diversity_epi$balanced.accuracy[3]*100, 1), "%"),
    cex = 0.75, adj = 0, col = "#C20463")
text(
  y = 2.7,
  x = min(diversity_train_splsda_final$variates$X[,1])*1.05,
  paste0("High Diversity Sens. = ", round(diversity_epi$precision[3]*100, 1), "%"),
    cex = 0.75, adj = 0, col = "#C20463")
text(
  y = 2.4,
  x = min(diversity_train_splsda_final$variates$X[,1])*1.05,
  paste0("High Diversity Spec. = ", round(diversity_epi$specificity[3]*100, 1), "%"),
    cex = 0.75, adj = 0, col = "#C20463")
# text(
#   y = 2.1,
#   x = min(diversity_train_splsda_final$variates$X[,1])*1.05,
#   paste0("High Diversity OR = ", round(diversity_epi$DOR[3], 1)),
#     cex = 0.75, adj = 0, col = "#C20463")

invisible(dev.off())
}

plotIndiv(
  diversity_train_splsda_final,
  xlim = c(min(diversity_train_splsda_final$variates$X[,1])*1.05, max(diversity_train_splsda_final$variates$X[,1])*1.8),
  ylim = c(min(diversity_train_splsda_final$variates$X[,2])*1.15, max(diversity_train_splsda_final$variates$X[,2])*1.8),
  comp = c(1,2),
  pch = 1,
  ind.names = FALSE,
  legend = FALSE,
  background = diversity_train_background,
  col = c("#3A001E", "#8A0246", "#C20463"),
  star = TRUE,
  point.lwd = 0.5,
  title = NULL,
  size.title = 0.00001,
  style = "graphics",
  legend.title = "Diversity Group",
  X.label = paste0("Component 1 (", round(diversity_train_splsda_final$prop_expl_var$X[1] * 100), "%)"),
  Y.label = paste0("Component 2 (", round(diversity_train_splsda_final$prop_expl_var$X[2] * 100), "%)")
) 
addtable2plot(
  y = -5.25,
  x = min(diversity_train_splsda_final$variates$X[,1])*1.1,
  diversity_confusion_df,
  bty = "o",
  display.rownames = TRUE,
  hlines = TRUE,
  vlines = TRUE,
  cex = 0.75,
  bg = "white"
)
text(
  y = -5.5,
  x = min(diversity_train_splsda_final$variates$X[,1])*1.1,
  paste0("Overall ACC = ", 
                    paste0(formatC(round(diversity_cm$overall[1], 3)*100, digits = 1, format = "f"), "%"), " [",
                    paste0(formatC(round(diversity_cm$overall[3], 3)*100, digits = 1, format = "f"), "%"),", ",
                    paste0(formatC(round(diversity_cm$overall[4], 3)*100, digits = 1, format = "f"), "%"),"]"),
    cex = 0.75, adj = 0)

text(
  y = 3,
  x = 1.5,
  paste0("Low Diversity ACC = ", round(diversity_epi$balanced.accuracy[1]*100, 1), "%"),
    cex = 0.75, adj = 0, col = "#3A001E")
text(
  y = 2.7,
  x = 1.5,
  paste0("Low Diversity Sens. = ", round(diversity_epi$precision[1]*100, 1), "%"),
    cex = 0.75, adj = 0, col = "#3A001E")
text(
  y = 2.4,
  x = 1.5,
  paste0("Low Diversity Spec. = ", round(diversity_epi$specificity[1]*100, 1), "%"),
    cex = 0.75, adj = 0, col = "#3A001E")
# text(
#   y = 2.1,
#   x = 1.5,
#   paste0("Low Diversity OR = ", round(diversity_epi$DOR[1], 1)),
#     cex = 0.75, adj = 0, col = "#3A001E")

text(
  y = -4.7,
  x = 0,
  paste0("Medium Diversity ACC = ", round(diversity_epi$balanced.accuracy[2]*100, 1), "%"),
    cex = 0.75, adj = 0, col = "#8A0246")
text(
  y = -5,
  x = 0,
  paste0("Medium Diversity Sens. = ", round(diversity_epi$precision[2]*100, 1), "%"),
    cex = 0.75, adj = 0, col = "#8A0246")
text(
  y = -5.3,
  x = 0,
  paste0("Medium Diversity Spec. = ", round(diversity_epi$specificity[2]*100, 1), "%"),
    cex = 0.75, adj = 0, col = "#8A0246")
# text(
#   y = -5.9,
#   x = 1.5,
#   paste0("Medium Diversity OR = ", round(diversity_epi$DOR[2], 1)),
#     cex = 0.75, adj = 0, col = "#8A0246")

text(
  y = 3,
  x = min(diversity_train_splsda_final$variates$X[,1])*1.05,
  paste0("High Diversity ACC = ", round(diversity_epi$balanced.accuracy[3]*100, 1), "%"),
    cex = 0.75, adj = 0, col = "#C20463")
text(
  y = 2.7,
  x = min(diversity_train_splsda_final$variates$X[,1])*1.05,
  paste0("High Diversity Sens. = ", round(diversity_epi$precision[3]*100, 1), "%"),
    cex = 0.75, adj = 0, col = "#C20463")
text(
  y = 2.4,
  x = min(diversity_train_splsda_final$variates$X[,1])*1.05,
  paste0("High Diversity Spec. = ", round(diversity_epi$specificity[3]*100, 1), "%"),
    cex = 0.75, adj = 0, col = "#C20463")

# text(
#   y = 2.1,
#   x = min(diversity_train_splsda_final$variates$X[,1])*1.05,
#   paste0("High Diversity OR = ", round(diversity_epi$DOR[3], 1)),
#     cex = 0.75, adj = 0, col = "#C20463")

Figure 6A: sPLS-DA Enterococcus Expansion using Qualitative Metabolomics

ecoc_doms_metab_mat <-
  metab_qual_anon %>%
  mutate(compound = ifelse(compound == "isovaleric-acid", "isovalerate", compound),
       compound = str_to_title(compound)) %>%
  filter(compound %in% heatmap_cmpds$compound|is.na(compound)) %>%
  ungroup() %>%
  group_by(sampleID, compound) %>% 
  summarise(mvalue = mean(mvalue, na.rm = TRUE)) %>% 
  ungroup() %>%
  mutate_all(~replace(., is.nan(.), NA)) %>% 
  select(sampleID, compound, mvalue) %>% 
  drop_na(sampleID) %>% 
  pivot_wider(names_from = "compound", values_from = "mvalue") %>% 
  filter(sampleID != "") %>%
  right_join(peri_matrix_all %>% 
             mutate(domination = case_when(enterococcus_rel_abundance >= optimal_cutpoint_rel$optimal_cutpoint[2] #|
                                             # enterobacterales_rel_abundance >= optimal_cutpoint_rel$optimal_cutpoint[1]
                                           ~ "Expansion",
                                          TRUE ~ "No Expansion")) %>% 
               select(sampleID, domination)) %>% 
  select(-domination) %>% 
  column_to_rownames(var = "sampleID") %>% 
  select(-`NA`) %>% 
  filter_all(any_vars(!is.na(.)))

  
ecoc_doms_metab_labs <-
  ecoc_doms_metab_mat %>% 
  rownames_to_column(var = "sampleID") %>% 
  left_join(peri_matrix_all %>% 
             mutate(domination = case_when(enterococcus_rel_abundance >= optimal_cutpoint_rel$optimal_cutpoint[2] #|
                                             # enterobacterales_rel_abundance >= optimal_cutpoint_rel$optimal_cutpoint[1] 
                                           ~ "Expansion",
                                          TRUE ~ "No Expansion")) %>% 
               select(sampleID, domination)) %>% 
  pull(domination)

dim(ecoc_doms_metab_mat) #107 93 (means 93 compounds and 108 LT patients)
## [1] 107  93
length(ecoc_doms_metab_labs) #107 (means 108 LT patients)
## [1] 107
# Begin model training
set.seed(1234)
ecoc_doms_train <- sample(1:nrow(ecoc_doms_metab_mat), as.integer(0.7*nrow(ecoc_doms_metab_mat))) # randomly select 70% samples in training
ecoc_doms_test <- setdiff(1:nrow(ecoc_doms_metab_mat), ecoc_doms_train) # rest is part of the test set

# store matrices into training and test set:
ecoc_doms_metab_mat.train <- ecoc_doms_metab_mat[ecoc_doms_train, ]
ecoc_doms_metab_mat.test <- ecoc_doms_metab_mat[ecoc_doms_test,]
ecoc_doms_metab_labs.train <- ecoc_doms_metab_labs[ecoc_doms_train]
ecoc_doms_metab_labs.test <- ecoc_doms_metab_labs[ecoc_doms_test]

# Train the model to tune hyperparameters
# Initial model to find optimal number of components to include
set.seed(1234)
ecoc_doms_train_splsda <- mixOmics::splsda(ecoc_doms_metab_mat.train, ecoc_doms_metab_labs.train, ncomp = 5)

# Performance assessment
## 5-fold, 50-repeat cross validation
set.seed(1234)
ecoc_doms_train_plsda_perf <-
  perf(
    ecoc_doms_train_splsda,
    validation = "Mfold",
    folds = 5,
    progressBar = FALSE,
    auc = TRUE,
    nrepeat = 50
  )

plot(
  ecoc_doms_train_plsda_perf,
  col = color.mixo(5:7),
  sd = FALSE,
  auc = TRUE,
  legend.position = "horizontal"
) # ncomp = 1 or 4 is best for classification error rate and max.dist

# Number of optimal variables to select for each component
ecoc_doms_train_keepX <- c(1:10,  seq(20, 108, 10))

set.seed(123)
ecoc_doms_train_tune_splsda <-
  mixOmics::tune.splsda(
    ecoc_doms_metab_mat.train,
    ecoc_doms_metab_labs.train,
    ncomp = 4, # Choose 4 components (max) to be safe
    validation = 'Mfold',
    folds = 5,
    dist = 'max.dist',
    progressBar = FALSE,
    auc = TRUE,
    measure = "BER",
    test.keepX = ecoc_doms_train_keepX,
    nrepeat = 50
  )

plot(ecoc_doms_train_tune_splsda, col = color.jet(4))

ecoc_doms_train_error <- ecoc_doms_train_tune_splsda$error.rate
ecoc_doms_train_ncomp <- ecoc_doms_train_tune_splsda$choice.ncomp$ncomp # optimal number of components based on t-tests on the error rate
# ecoc_doms_train_ncomp = 4 #4 components are optimal via visual inspection

ecoc_doms_train_select_keepX <- ecoc_doms_train_tune_splsda$choice.keepX[1:ifelse(ecoc_doms_train_ncomp == 1, ecoc_doms_train_ncomp + 1, ecoc_doms_train_ncomp)]  # optimal number of variables to select per component
ecoc_doms_train_select_keepX
## comp1 comp2 
##     1    20
# Final Model
ecoc_doms_train_splsda_final <-
  mixOmics::splsda(ecoc_doms_metab_mat.train, ecoc_doms_metab_labs.train, ncomp = ifelse(ecoc_doms_train_ncomp == 1, ecoc_doms_train_ncomp + 1, ecoc_doms_train_ncomp), keepX = ecoc_doms_train_select_keepX)

# Test the model
ecoc_doms_predict_train_splsda_final <- predict(ecoc_doms_train_splsda_final, ecoc_doms_metab_mat.test, 
                                dist = "max.dist")

ecoc_doms_predict_train_comp2 <- ecoc_doms_predict_train_splsda_final$class$max.dist[,ifelse(ecoc_doms_train_ncomp == 1, ecoc_doms_train_ncomp + 1, ecoc_doms_train_ncomp)]

ecoc_doms_union <- union(ecoc_doms_predict_train_comp2, ecoc_doms_metab_labs.test)

confusionMatrix(table(factor(ecoc_doms_predict_train_comp2, ecoc_doms_union),
                      factor(ecoc_doms_metab_labs.test, ecoc_doms_union)),
                 negative = "0")
## Confusion Matrix and Statistics
## 
##               
##                No Expansion Expansion
##   No Expansion           16         5
##   Expansion               3         9
##                                           
##                Accuracy : 0.7576          
##                  95% CI : (0.5774, 0.8891)
##     No Information Rate : 0.5758          
##     P-Value [Acc > NIR] : 0.02392         
##                                           
##                   Kappa : 0.4943          
##                                           
##  Mcnemar's Test P-Value : 0.72367         
##                                           
##             Sensitivity : 0.8421          
##             Specificity : 0.6429          
##          Pos Pred Value : 0.7619          
##          Neg Pred Value : 0.7500          
##              Prevalence : 0.5758          
##          Detection Rate : 0.4848          
##    Detection Prevalence : 0.6364          
##       Balanced Accuracy : 0.7425          
##                                           
##        'Positive' Class : No Expansion    
## 
ecoc_doms_train_background <- background.predict(ecoc_doms_train_splsda_final,
                                            comp.predicted = 2,
                                            xlim = c(-20,20),
                                            ylim = c(-20,20),
                                            dist = "centroids.dist") 

# Model metrics for all samples
ecoc_doms_tot <- predict(ecoc_doms_train_splsda_final,
                        ecoc_doms_metab_mat,
                        dist = "max.dist")

ecoc_doms_tot_predict <- ecoc_doms_tot$class$max.dist[,ifelse(ecoc_doms_train_ncomp == 1, ecoc_doms_train_ncomp + 1, ecoc_doms_train_ncomp)]

ecoc_doms_tot_union <- union(ecoc_doms_tot_predict, ecoc_doms_metab_labs)

ecoc_doms_cm <- confusionMatrix(table(factor(ecoc_doms_tot_predict, ecoc_doms_tot_union,
                                        levels = c("No Expansion","Expansion")),
                      factor(ecoc_doms_metab_labs, ecoc_doms_tot_union,
                             levels = c("No Expansion","Expansion"))),
                      positive = "Expansion")
ecoc_doms_cm
## Confusion Matrix and Statistics
## 
##               
##                No Expansion Expansion
##   No Expansion           59        14
##   Expansion               7        27
##                                           
##                Accuracy : 0.8037          
##                  95% CI : (0.7158, 0.8742)
##     No Information Rate : 0.6168          
##     P-Value [Acc > NIR] : 2.546e-05       
##                                           
##                   Kappa : 0.5709          
##                                           
##  Mcnemar's Test P-Value : 0.1904          
##                                           
##             Sensitivity : 0.6585          
##             Specificity : 0.8939          
##          Pos Pred Value : 0.7941          
##          Neg Pred Value : 0.8082          
##              Prevalence : 0.3832          
##          Detection Rate : 0.2523          
##    Detection Prevalence : 0.3178          
##       Balanced Accuracy : 0.7762          
##                                           
##        'Positive' Class : Expansion       
## 
# Additional model measures
ecoc_doms_epi <- epiR::epi.tests(table(ecoc_doms_tot_predict, ecoc_doms_metab_labs), conf.level = 0.95)

ecoc_doms_confusion_df <- ecoc_doms_epi$tab %>% 
  t() %>% 
  as.data.frame() %>% 
  rownames_to_column(var = "actual") %>% 
  mutate(actual = case_when(grepl(actual, pattern = "+", fixed = TRUE) ~ "Actual\nExpansion",
                            grepl(actual, pattern = "-", fixed = TRUE) ~ "Actual\nNo Expansion",
                            TRUE ~ "Total")) %>% 
  dplyr::rename("Predicted\nExpansion" = "Test +",
                "Predicted\nNo Expansion" = "Test -") %>% 
  column_to_rownames(var = "actual")



{
pdf(file = "./Results/Figure_6A.pdf", height = 10, width = 10)
plotIndiv(
  ecoc_doms_train_splsda_final,
  comp = c(1,2),
  pch = 1,
  ind.names = FALSE,
  legend = FALSE,
  background = ecoc_doms_train_background,
  col = c("#0C7A3A", "black"),
  star = TRUE,
  point.lwd = 0.5,
  title = NULL,
  size.title = 0.00001,
  style = "graphics", 
  legend.title = "Expansion",
  X.label = paste0("Component 1 (", round(ecoc_doms_train_splsda_final$prop_expl_var$X[1] * 100), "%)"),
  Y.label = paste0("Component 2 (", round(ecoc_doms_train_splsda_final$prop_expl_var$X[2] * 100), "%)")
) 
addtable2plot(
  y = min(ecoc_doms_train_splsda_final$variates$X[,2])*1.05,
  x = min(ecoc_doms_train_splsda_final$variates$X[,1])*1.15,
  ecoc_doms_confusion_df,
  bty = "o",
  display.rownames = TRUE,
  hlines = TRUE,
  vlines = TRUE,
  cex = 0.75,
  bg = "white"
)
text(
  y = -3.5,
  x = 1.5 ,
  substitute(bold("Expansion")), cex = 1.75, adj = 0, col = "#0C7A3A")
text(
  y = 3 ,
  x = 0.75,
  substitute(bold("No Expansion")), cex = 1.75, adj = 0, col = "black")
text(
  y = min(ecoc_doms_train_splsda_final$variates$X[,2])*0.825,
  x = max(ecoc_doms_train_splsda_final$variates$X[,1])*0.25,
  paste0("ACC = ", 
                    round(ecoc_doms_cm$overall[1]*100, 1), "% [",
                    round(ecoc_doms_cm$overall[3]*100, 1), "%, ",
                    round(ecoc_doms_cm$overall[4]*100, 1), "%]"),
    cex = 0.75, adj = 0)
text(
  y = min(ecoc_doms_train_splsda_final$variates$X[,2])*0.875,
  x = max(ecoc_doms_train_splsda_final$variates$X[,1])*0.25,
  paste0("Sens. = ", 
                    paste0(formatC(round(ecoc_doms_epi$detail[2][3,], 3)*100, digits = 1, format = "f"), "%"), " [",
                    paste0(formatC(round(ecoc_doms_epi$detail[3][3,], 3)*100, digits = 1, format = "f"), "%"),", ",
                    paste0(formatC(round(ecoc_doms_epi$detail[4][3,], 3)*100, digits = 1, format = "f"), "%"),"]"),
    cex = 0.75, adj = 0)
text(
  y = min(ecoc_doms_train_splsda_final$variates$X[,2])*0.925,
  x = max(ecoc_doms_train_splsda_final$variates$X[,1])*0.25,
  paste0("Spec. = ", 
                    paste0(formatC(round(ecoc_doms_epi$detail[2][4,], 3)*100, digits = 1, format = "f"), "%"), " [",
                    paste0(formatC(round(ecoc_doms_epi$detail[3][4,], 3)*100, digits = 1, format = "f"), "%"),", ",
                    paste0(formatC(round(ecoc_doms_epi$detail[4][4,], 3)*100, digits = 1, format = "f"), "%"),"]"),
    cex = 0.75, adj = 0)
text(
  y = min(ecoc_doms_train_splsda_final$variates$X[,2])*0.975,
  x = max(ecoc_doms_train_splsda_final$variates$X[,1])*0.25,
  paste0("OR = ",
                    formatC(round(ecoc_doms_epi$detail[2][6,], 3), digits = 1, format = "f"),  " [",
                    formatC(round(ecoc_doms_epi$detail[3][6,], 3), digits = 1, format = "f"), ", ",
                    formatC(round(ecoc_doms_epi$detail[4][6,], 3), digits = 1, format = "f"), "]"),
    cex = 0.75, adj = 0)

invisible(dev.off())
}

plotIndiv(
  ecoc_doms_train_splsda_final,
  comp = c(1,2),
  pch = 1,
  ind.names = FALSE,
  legend = FALSE,
  background = ecoc_doms_train_background,
  col = c("#0C7A3A", "black"),
  star = TRUE,
  point.lwd = 0.5,
  title = NULL,
  size.title = 0.00001,
  style = "graphics", 
  legend.title = "Expansion",
  X.label = paste0("Component 1 (", round(ecoc_doms_train_splsda_final$prop_expl_var$X[1] * 100), "%)"),
  Y.label = paste0("Component 2 (", round(ecoc_doms_train_splsda_final$prop_expl_var$X[2] * 100), "%)")
) 
addtable2plot(
  y = min(ecoc_doms_train_splsda_final$variates$X[,2])*1.05,
  x = min(ecoc_doms_train_splsda_final$variates$X[,1])*1.15,
  ecoc_doms_confusion_df,
  bty = "o",
  display.rownames = TRUE,
  hlines = TRUE,
  vlines = TRUE,
  cex = 0.75,
  bg = "white"
)
text(
  y = -3.5,
  x = 1.5 ,
  substitute(bold("Expansion")), cex = 1.75, adj = 0, col = "#0C7A3A")
text(
  y = 3 ,
  x = 0.75,
  substitute(bold("No Expansion")), cex = 1.75, adj = 0, col = "black")
text(
  y = min(ecoc_doms_train_splsda_final$variates$X[,2])*0.825,
  x = max(ecoc_doms_train_splsda_final$variates$X[,1])*0.25,
  paste0("ACC = ", 
                    round(ecoc_doms_cm$overall[1]*100, 1), "% [",
                    round(ecoc_doms_cm$overall[3]*100, 1), "%, ",
                    round(ecoc_doms_cm$overall[4]*100, 1), "%]"),
    cex = 0.75, adj = 0)
text(
  y = min(ecoc_doms_train_splsda_final$variates$X[,2])*0.875,
  x = max(ecoc_doms_train_splsda_final$variates$X[,1])*0.25,
  paste0("Sens. = ", 
                    paste0(formatC(round(ecoc_doms_epi$detail[2][3,], 3)*100, digits = 1, format = "f"), "%"), " [",
                    paste0(formatC(round(ecoc_doms_epi$detail[3][3,], 3)*100, digits = 1, format = "f"), "%"),", ",
                    paste0(formatC(round(ecoc_doms_epi$detail[4][3,], 3)*100, digits = 1, format = "f"), "%"),"]"),
    cex = 0.75, adj = 0)
text(
  y = min(ecoc_doms_train_splsda_final$variates$X[,2])*0.925,
  x = max(ecoc_doms_train_splsda_final$variates$X[,1])*0.25,
  paste0("Spec. = ", 
                    paste0(formatC(round(ecoc_doms_epi$detail[2][4,], 3)*100, digits = 1, format = "f"), "%"), " [",
                    paste0(formatC(round(ecoc_doms_epi$detail[3][4,], 3)*100, digits = 1, format = "f"), "%"),", ",
                    paste0(formatC(round(ecoc_doms_epi$detail[4][4,], 3)*100, digits = 1, format = "f"), "%"),"]"),
    cex = 0.75, adj = 0)
text(
  y = min(ecoc_doms_train_splsda_final$variates$X[,2])*0.975,
  x = max(ecoc_doms_train_splsda_final$variates$X[,1])*0.25,
  paste0("OR = ",
                    formatC(round(ecoc_doms_epi$detail[2][6,], 3), digits = 1, format = "f"),  " [",
                    formatC(round(ecoc_doms_epi$detail[3][6,], 3), digits = 1, format = "f"), ", ",
                    formatC(round(ecoc_doms_epi$detail[4][6,], 3), digits = 1, format = "f"), "]"),
    cex = 0.75, adj = 0)

Figure 6C: sPLS-DA Enterobacterales Expansion using Qualitative Metabolomics

ebac_doms_metab_mat <-
  metab_qual_anon %>%
  mutate(compound = ifelse(compound == "isovaleric-acid", "isovalerate", compound),
       compound = str_to_title(compound)) %>%
  filter(compound %in% heatmap_cmpds$compound|is.na(compound)) %>%
  ungroup() %>%
  group_by(sampleID, compound) %>% 
  summarise(mvalue = mean(mvalue, na.rm = TRUE)) %>% 
  ungroup() %>%
  mutate_all(~replace(., is.nan(.), NA)) %>% 
  select(sampleID, compound, mvalue) %>% 
  drop_na(sampleID) %>% 
  pivot_wider(names_from = "compound", values_from = "mvalue") %>% 
  filter(sampleID != "") %>%
  right_join(peri_matrix_all %>% 
             mutate(domination = case_when(
               #enterococcus_rel_abundance >= optimal_cutpoint_rel$optimal_cutpoint[2] #|
                                             enterobacterales_rel_abundance >= optimal_cutpoint_rel$optimal_cutpoint[1]
                                           ~ "Expansion",
                                          TRUE ~ "No Expansion")) %>% 
               select(sampleID, domination)) %>% 
  select(-domination) %>% 
  column_to_rownames(var = "sampleID") %>% 
  select(-`NA`) %>% 
  filter_all(any_vars(!is.na(.)))

  
ebac_doms_metab_labs <-
  ebac_doms_metab_mat %>% 
  rownames_to_column(var = "sampleID") %>% 
  left_join(peri_matrix_all %>% 
             mutate(domination = case_when(
               # enterococcus_rel_abundance >= optimal_cutpoint_rel$optimal_cutpoint[2] #|
                                             enterobacterales_rel_abundance >= optimal_cutpoint_rel$optimal_cutpoint[1]
                                           ~ "Expansion",
                                          TRUE ~ "No Expansion")) %>% 
               select(sampleID, domination)) %>% 
  pull(domination)

dim(ebac_doms_metab_mat) #107 93 (means 93 compounds and 108 LT patients)
## [1] 107  93
length(ebac_doms_metab_labs) #107 (means 108 LT patients)
## [1] 107
# Begin model training
set.seed(1234)
ebac_doms_train <- sample(1:nrow(ebac_doms_metab_mat), as.integer(0.7*nrow(ebac_doms_metab_mat))) # randomly select 70% samples in training
ebac_doms_test <- setdiff(1:nrow(ebac_doms_metab_mat), ebac_doms_train) # rest is part of the test set

# store matrices into training and test set:
ebac_doms_metab_mat.train <- ebac_doms_metab_mat[ebac_doms_train, ]
ebac_doms_metab_mat.test <- ebac_doms_metab_mat[ebac_doms_test,]
ebac_doms_metab_labs.train <- ebac_doms_metab_labs[ebac_doms_train]
ebac_doms_metab_labs.test <- ebac_doms_metab_labs[ebac_doms_test]

# Train the model to tune hyperparameters
# Initial model to find optimal number of components to include
set.seed(1234)
ebac_doms_train_splsda <- mixOmics::splsda(ebac_doms_metab_mat.train, ebac_doms_metab_labs.train, ncomp = 5)

# Performance assessment
## 5-fold, 50-repeat cross validation
set.seed(1234)
ebac_doms_train_plsda_perf <-
  perf(
    ebac_doms_train_splsda,
    validation = "Mfold",
    folds = 5,
    progressBar = FALSE,
    auc = TRUE,
    nrepeat = 50
  )

plot(
  ebac_doms_train_plsda_perf,
  col = color.mixo(5:7),
  sd = FALSE,
  auc = TRUE,
  legend.position = "horizontal"
) # ncomp = 2 or 4 is best for classification error rate and max.dist

# Number of optimal variables to select for each component
ebac_doms_train_keepX <- c(1:10,  seq(20, 108, 10))

set.seed(123)
ebac_doms_train_tune_splsda <-
  mixOmics::tune.splsda(
    ebac_doms_metab_mat.train,
    ebac_doms_metab_labs.train,
    ncomp = 4, # Choose 4 components (max) to be safe
    validation = 'Mfold',
    folds = 5,
    dist = 'max.dist',
    progressBar = FALSE,
    auc = TRUE,
    measure = "BER",
    test.keepX = ebac_doms_train_keepX,
    nrepeat = 50
  )

plot(ebac_doms_train_tune_splsda, col = color.jet(4))

ebac_doms_train_error <- ebac_doms_train_tune_splsda$error.rate
ebac_doms_train_ncomp <- ebac_doms_train_tune_splsda$choice.ncomp$ncomp # optimal number of components based on t-tests on the error rate
ebac_doms_train_ncomp #1 components are optimal
## [1] 1
ebac_doms_train_select_keepX <- ebac_doms_train_tune_splsda$choice.keepX[1:ifelse(ebac_doms_train_ncomp == 1, ebac_doms_train_ncomp + 1, ebac_doms_train_ncomp)]  # optimal number of variables to select per component
ebac_doms_train_select_keepX
## comp1 comp2 
##     1    10
# Final Model
ebac_doms_train_splsda_final <-
  mixOmics::splsda(ebac_doms_metab_mat.train, ebac_doms_metab_labs.train, ncomp = ifelse(ebac_doms_train_ncomp == 1, ebac_doms_train_ncomp + 1, ebac_doms_train_ncomp), keepX = ebac_doms_train_select_keepX)

# Test the model
ebac_doms_predict_train_splsda_final <- predict(ebac_doms_train_splsda_final, ebac_doms_metab_mat.test, 
                                dist = "max.dist")

ebac_doms_predict_train_comp2 <- ebac_doms_predict_train_splsda_final$class$max.dist[,ifelse(ebac_doms_train_ncomp == 1, ebac_doms_train_ncomp + 1, ebac_doms_train_ncomp)]

ebac_doms_union <- union(ebac_doms_predict_train_comp2, ebac_doms_metab_labs.test)

confusionMatrix(table(factor(ebac_doms_predict_train_comp2, ebac_doms_union),
                      factor(ebac_doms_metab_labs.test, ebac_doms_union)),
                 negative = "0")
## Confusion Matrix and Statistics
## 
##               
##                Expansion No Expansion
##   Expansion            1            1
##   No Expansion         6           25
##                                           
##                Accuracy : 0.7879          
##                  95% CI : (0.6109, 0.9102)
##     No Information Rate : 0.7879          
##     P-Value [Acc > NIR] : 0.5994          
##                                           
##                   Kappa : 0.1413          
##                                           
##  Mcnemar's Test P-Value : 0.1306          
##                                           
##             Sensitivity : 0.14286         
##             Specificity : 0.96154         
##          Pos Pred Value : 0.50000         
##          Neg Pred Value : 0.80645         
##              Prevalence : 0.21212         
##          Detection Rate : 0.03030         
##    Detection Prevalence : 0.06061         
##       Balanced Accuracy : 0.55220         
##                                           
##        'Positive' Class : Expansion       
## 
ebac_doms_train_background <- background.predict(ebac_doms_train_splsda_final,
                                            comp.predicted = 2,
                                            xlim = c(-20,20),
                                            ylim = c(-20,20),
                                            dist = "centroids.dist") 

# Model metrics for all samples
ebac_doms_tot <- predict(ebac_doms_train_splsda_final,
                        ebac_doms_metab_mat,
                        dist = "max.dist")

ebac_doms_tot_predict <- ebac_doms_tot$class$max.dist[,ifelse(ebac_doms_train_ncomp == 1, ebac_doms_train_ncomp + 1, ebac_doms_train_ncomp)]

ebac_doms_tot_union <- union(ebac_doms_tot_predict, ebac_doms_metab_labs)

ebac_doms_cm <- confusionMatrix(table(factor(ebac_doms_tot_predict, ebac_doms_tot_union,
                                        levels = c("No Expansion","Expansion")),
                      factor(ebac_doms_metab_labs, ebac_doms_tot_union,
                             levels = c("No Expansion","Expansion"))),
                      positive = "Expansion")
ebac_doms_cm
## Confusion Matrix and Statistics
## 
##               
##                Expansion No Expansion
##   Expansion           78           20
##   No Expansion         1            8
##                                           
##                Accuracy : 0.8037          
##                  95% CI : (0.7158, 0.8742)
##     No Information Rate : 0.7383          
##     P-Value [Acc > NIR] : 0.07337         
##                                           
##                   Kappa : 0.3496          
##                                           
##  Mcnemar's Test P-Value : 8.568e-05       
##                                           
##             Sensitivity : 0.9873          
##             Specificity : 0.2857          
##          Pos Pred Value : 0.7959          
##          Neg Pred Value : 0.8889          
##              Prevalence : 0.7383          
##          Detection Rate : 0.7290          
##    Detection Prevalence : 0.9159          
##       Balanced Accuracy : 0.6365          
##                                           
##        'Positive' Class : Expansion       
## 
# Additional model measures
ebac_doms_epi <- epiR::epi.tests(table(ebac_doms_tot_predict, ebac_doms_metab_labs), conf.level = 0.95)

ebac_doms_confusion_df <- ebac_doms_epi$tab %>% 
  t() %>% 
  as.data.frame() %>% 
  rownames_to_column(var = "actual") %>% 
  mutate(actual = case_when(grepl(actual, pattern = "+", fixed = TRUE) ~ "Actual\nExpansion",
                            grepl(actual, pattern = "-", fixed = TRUE) ~ "Actual\nNo Expansion",
                            TRUE ~ "Total")) %>% 
  dplyr::rename("Predicted\nExpansion" = "Test +",
                "Predicted\nNo Expansion" = "Test -") %>% 
  column_to_rownames(var = "actual")


{
pdf(file = "./Results/Figure_6C.pdf", height = 10, width = 10)
plotIndiv(
  ebac_doms_train_splsda_final,
  comp = c(1,2),
  ylim = c(-5,16),
  pch = 1,
  ind.names = FALSE,
  legend = FALSE,
  background = ebac_doms_train_background,
  col = c("#FF0000", "black"),
  star = TRUE,
  point.lwd = 0.5,
  title = NULL,
  size.title = 0.00001,
  style = "graphics", 
  legend.title = "Expansion",
  X.label = paste0("Component 1 (", round(ebac_doms_train_splsda_final$prop_expl_var$X[1] * 100), "%)"),
  Y.label = paste0("Component 2 (", round(ebac_doms_train_splsda_final$prop_expl_var$X[2] * 100), "%)")
) 
addtable2plot(
  y = min(ebac_doms_train_splsda_final$variates$X[,2])*-3,
  x = min(ebac_doms_train_splsda_final$variates$X[,1])*-10,
  ebac_doms_confusion_df,
  bty = "o",
  display.rownames = TRUE,
  hlines = TRUE,
  vlines = TRUE,
  cex = 0.75,
  bg = "white"
)
text(
  y = 12,
  x = 1.25,
  substitute(bold("Expansion")), cex = 1.75, adj = 0, col = "#FF0000")
text(
  y = -4,
  x = 0.5,
  substitute(bold("No Expansion")), cex = 1.75, adj = 0, col = "black")
text(
  y = min(ebac_doms_train_splsda_final$variates$X[,2])*-2.25,
  x = min(ebac_doms_train_splsda_final$variates$X[,1])*-12,
  paste0("ACC = ", 
                    round(ebac_doms_cm$overall[1]*100, 1), "% [",
                    round(ebac_doms_cm$overall[3]*100, 1), "%, ",
                    round(ebac_doms_cm$overall[4]*100, 1), "%]"),
    cex = 0.75, adj = 0)
text(
  y = min(ebac_doms_train_splsda_final$variates$X[,2])*-2.05,
  x = min(ebac_doms_train_splsda_final$variates$X[,1])*-12,
  paste0("Sens. = ", 
                    paste0(formatC(round(ebac_doms_epi$detail[2][3,], 3)*100, digits = 1, format = "f"), "%"), " [",
                    paste0(formatC(round(ebac_doms_epi$detail[3][3,], 3)*100, digits = 1, format = "f"), "%"),", ",
                    paste0(formatC(round(ebac_doms_epi$detail[4][3,], 3)*100, digits = 1, format = "f"), "%"),"]"),
    cex = 0.75, adj = 0)
text(
  y = min(ebac_doms_train_splsda_final$variates$X[,2])*-1.85,
  x = min(ebac_doms_train_splsda_final$variates$X[,1])*-12,
  paste0("Spec. = ", 
                    paste0(formatC(round(ebac_doms_epi$detail[2][4,], 3)*100, digits = 1, format = "f"), "%"), " [",
                    paste0(formatC(round(ebac_doms_epi$detail[3][4,], 3)*100, digits = 1, format = "f"), "%"),", ",
                    paste0(formatC(round(ebac_doms_epi$detail[4][4,], 3)*100, digits = 1, format = "f"), "%"),"]"),
    cex = 0.75, adj = 0)
text(
  y = min(ebac_doms_train_splsda_final$variates$X[,2])*-1.65,
  x = min(ebac_doms_train_splsda_final$variates$X[,1])*-12,
  paste0("OR = ",
                    formatC(round(ebac_doms_epi$detail[2][6,], 3), digits = 1, format = "f"),  " [",
                    formatC(round(ebac_doms_epi$detail[3][6,], 3), digits = 1, format = "f"), ", ",
                    formatC(round(ebac_doms_epi$detail[4][6,], 3), digits = 1, format = "f"), "]"),
    cex = 0.75, adj = 0)

invisible(dev.off())
}

plotIndiv(
  ebac_doms_train_splsda_final,
  comp = c(1,2),
  ylim = c(-5,16),
  pch = 1,
  ind.names = FALSE,
  legend = FALSE,
  background = ebac_doms_train_background,
  col = c("#FF0000", "black"),
  star = TRUE,
  point.lwd = 0.5,
  title = NULL,
  size.title = 0.00001,
  style = "graphics", 
  legend.title = "Expansion",
  X.label = paste0("Component 1 (", round(ebac_doms_train_splsda_final$prop_expl_var$X[1] * 100), "%)"),
  Y.label = paste0("Component 2 (", round(ebac_doms_train_splsda_final$prop_expl_var$X[2] * 100), "%)")
) 
addtable2plot(
  y = min(ebac_doms_train_splsda_final$variates$X[,2])*-3,
  x = min(ebac_doms_train_splsda_final$variates$X[,1])*-10,
  ebac_doms_confusion_df,
  bty = "o",
  display.rownames = TRUE,
  hlines = TRUE,
  vlines = TRUE,
  cex = 0.75,
  bg = "white"
)
text(
  y = 12,
  x = 1.25,
  substitute(bold("Expansion")), cex = 1.75, adj = 0, col = "#FF0000")
text(
  y = -4,
  x = 0.5,
  substitute(bold("No Expansion")), cex = 1.75, adj = 0, col = "black")
text(
  y = min(ebac_doms_train_splsda_final$variates$X[,2])*-2.25,
  x = min(ebac_doms_train_splsda_final$variates$X[,1])*-12,
  paste0("ACC = ", 
                    round(ebac_doms_cm$overall[1]*100, 1), "% [",
                    round(ebac_doms_cm$overall[3]*100, 1), "%, ",
                    round(ebac_doms_cm$overall[4]*100, 1), "%]"),
    cex = 0.75, adj = 0)
text(
  y = min(ebac_doms_train_splsda_final$variates$X[,2])*-2.05,
  x = min(ebac_doms_train_splsda_final$variates$X[,1])*-12,
  paste0("Sens. = ", 
                    paste0(formatC(round(ebac_doms_epi$detail[2][3,], 3)*100, digits = 1, format = "f"), "%"), " [",
                    paste0(formatC(round(ebac_doms_epi$detail[3][3,], 3)*100, digits = 1, format = "f"), "%"),", ",
                    paste0(formatC(round(ebac_doms_epi$detail[4][3,], 3)*100, digits = 1, format = "f"), "%"),"]"),
    cex = 0.75, adj = 0)
text(
  y = min(ebac_doms_train_splsda_final$variates$X[,2])*-1.85,
  x = min(ebac_doms_train_splsda_final$variates$X[,1])*-12,
  paste0("Spec. = ", 
                    paste0(formatC(round(ebac_doms_epi$detail[2][4,], 3)*100, digits = 1, format = "f"), "%"), " [",
                    paste0(formatC(round(ebac_doms_epi$detail[3][4,], 3)*100, digits = 1, format = "f"), "%"),", ",
                    paste0(formatC(round(ebac_doms_epi$detail[4][4,], 3)*100, digits = 1, format = "f"), "%"),"]"),
    cex = 0.75, adj = 0)
text(
  y = min(ebac_doms_train_splsda_final$variates$X[,2])*-1.65,
  x = min(ebac_doms_train_splsda_final$variates$X[,1])*-12,
  paste0("OR = ",
                    formatC(round(ebac_doms_epi$detail[2][6,], 3), digits = 1, format = "f"),  " [",
                    formatC(round(ebac_doms_epi$detail[3][6,], 3), digits = 1, format = "f"), ", ",
                    formatC(round(ebac_doms_epi$detail[4][6,], 3), digits = 1, format = "f"), "]"),
    cex = 0.75, adj = 0)

Figure 7A: sPLS-DA Any Bacterial Infection using Qualitative Metabolomics

infx_metab_mat <-
  metab_qual_anon %>%
  mutate(compound = ifelse(compound == "isovaleric-acid", "isovalerate", compound),
       compound = str_to_title(compound)) %>%
  filter(compound %in% heatmap_cmpds$compound|is.na(compound)) %>%
  ungroup() %>%
  right_join(peri_matrix_all %>% select(sampleID, bact_infection_present)) %>% 
  group_by(sampleID, compound, bact_infection_present) %>% 
  summarise(mvalue = mean(mvalue, na.rm = TRUE)) %>% 
  ungroup() %>%
  mutate_all(~replace(., is.nan(.), NA)) %>% 
  select(sampleID, compound, mvalue, bact_infection_present) %>% 
  drop_na(sampleID) %>% 
  pivot_wider(names_from = "compound", values_from = "mvalue") %>% 
  filter(sampleID != "") %>%
  mutate(bact_infection_present = ifelse(grepl(x = bact_infection_present, pattern = "No"), "No Infection", "Infection")) %>% 
  select(-bact_infection_present) %>% 
  column_to_rownames(var = "sampleID") %>% 
  select(-`NA`) %>% 
  filter_all(any_vars(!is.na(.)))

  
infx_metab_labs <-
  infx_metab_mat %>% 
  rownames_to_column(var = "sampleID") %>% 
  left_join(peri_matrix_all %>% select(sampleID, bact_infection_present)) %>% 
  mutate(bact_infection_present = ifelse(grepl(x = bact_infection_present, pattern = "No"), "No Infection", "Infection")) %>% 
  pull(bact_infection_present)

dim(infx_metab_mat) #107 93 (means 93 compounds and 108 LT patients/infections)
## [1] 107  93
length(infx_metab_labs) #107 (means 108 LT patients/infections)
## [1] 107
# Begin model training
set.seed(1234)
infx_train <- sample(1:nrow(infx_metab_mat), as.integer(0.7*nrow(infx_metab_mat))) # randomly select 70% samples in training
infx_test <- setdiff(1:nrow(infx_metab_mat), infx_train) # rest is part of the test set

# store matrices into training and test set:
infx_metab_mat.train <- infx_metab_mat[infx_train, ]
infx_metab_mat.test <- infx_metab_mat[infx_test,]
infx_metab_labs.train <- infx_metab_labs[infx_train]
infx_metab_labs.test <- infx_metab_labs[infx_test]

# Train the model to tune hyperparameters
# Initial model to find optimal number of components to include
set.seed(1234)
infx_train_splsda <- mixOmics::splsda(infx_metab_mat.train, infx_metab_labs.train, ncomp = 5)

# Performance assessment
## 5-fold, 100-repeat cross validation
set.seed(1234)
infx_train_plsda_perf <-
  perf(
    infx_train_splsda,
    validation = "Mfold",
    folds = 5,
    progressBar = FALSE,
    auc = TRUE,
    nrepeat = 50
  )

plot(
  infx_train_plsda_perf,
  col = color.mixo(5:7),
  sd = FALSE,
  auc = TRUE,
  legend.position = "horizontal"
) # ncomp = 1 is best for classification error rate and max.dist

# Number of optimal variables to select for each component
infx_train_keepX <- c(1:10,  seq(20, 108, 10))

set.seed(123)
infx_train_tune_splsda <-
  mixOmics::tune.splsda(
    infx_metab_mat.train,
    infx_metab_labs.train,
    ncomp = 3, # Choose 3 components (max) to be safe
    validation = 'Mfold',
    folds = 5,
    dist = 'max.dist',
    progressBar = FALSE,
    auc = TRUE,
    measure = "BER",
    test.keepX = infx_train_keepX,
    nrepeat = 50
  )

plot(infx_train_tune_splsda, col = color.jet(3))

infx_train_error <- infx_train_tune_splsda$error.rate
infx_train_ncomp <- infx_train_tune_splsda$choice.ncomp$ncomp # optimal number of components based on t-tests on the error rate
infx_train_ncomp #1 component is optimal
## [1] 1
infx_train_select_keepX <- infx_train_tune_splsda$choice.keepX[1:(infx_train_ncomp + 1)]  # optimal number of variables to select per component
infx_train_select_keepX
## comp1 comp2 
##    90     3
# Final Model
infx_train_splsda_final <-
  mixOmics::splsda(infx_metab_mat.train, infx_metab_labs.train, ncomp = (infx_train_ncomp + 1), keepX = infx_train_select_keepX)

# Test the model
infx_predict_train_splsda_final <- predict(infx_train_splsda_final, infx_metab_mat.test, 
                                dist = "max.dist")

infx_predict_train_comp2 <- infx_predict_train_splsda_final$class$max.dist[,(infx_train_ncomp + 1)]

infx_union <- union(infx_predict_train_comp2, infx_metab_labs.test)

confusionMatrix(table(factor(infx_predict_train_comp2, infx_union),
                      factor(infx_metab_labs.test, infx_union)),
                 positive = "Infection")
## Confusion Matrix and Statistics
## 
##               
##                No Infection Infection
##   No Infection           20         6
##   Infection               6         1
##                                          
##                Accuracy : 0.6364         
##                  95% CI : (0.4512, 0.796)
##     No Information Rate : 0.7879         
##     P-Value [Acc > NIR] : 0.9865         
##                                          
##                   Kappa : -0.0879        
##                                          
##  Mcnemar's Test P-Value : 1.0000         
##                                          
##             Sensitivity : 0.1429         
##             Specificity : 0.7692         
##          Pos Pred Value : 0.1429         
##          Neg Pred Value : 0.7692         
##              Prevalence : 0.2121         
##          Detection Rate : 0.0303         
##    Detection Prevalence : 0.2121         
##       Balanced Accuracy : 0.4560         
##                                          
##        'Positive' Class : Infection      
## 
infx_train_background <- background.predict(infx_train_splsda_final,
                                            comp.predicted = 2,
                                            xlim = c(-20,20),
                                            ylim = c(-20,20),
                                            dist = "centroids.dist") 

# Model metrics for all samples
infx_tot <- predict(infx_train_splsda_final,
                        infx_metab_mat,
                        dist = "max.dist")

infx_tot_predict <- infx_tot$class$max.dist[,(infx_train_ncomp + 1)]

infx_tot_union <- union(infx_tot_predict, infx_metab_labs)

infx_cm <- confusionMatrix(table(factor(infx_tot_predict, infx_tot_union,
                                        levels = c("Infection", "No Infection")),
                      factor(infx_metab_labs, infx_tot_union,
                             levels = c("Infection", "No Infection"))),
                      positive = "Infection")
infx_cm
## Confusion Matrix and Statistics
## 
##               
##                No Infection Infection
##   No Infection           12         7
##   Infection              15        73
##                                           
##                Accuracy : 0.7944          
##                  95% CI : (0.7054, 0.8664)
##     No Information Rate : 0.7477          
##     P-Value [Acc > NIR] : 0.1582          
##                                           
##                   Kappa : 0.3958          
##                                           
##  Mcnemar's Test P-Value : 0.1356          
##                                           
##             Sensitivity : 0.9125          
##             Specificity : 0.4444          
##          Pos Pred Value : 0.8295          
##          Neg Pred Value : 0.6316          
##              Prevalence : 0.7477          
##          Detection Rate : 0.6822          
##    Detection Prevalence : 0.8224          
##       Balanced Accuracy : 0.6785          
##                                           
##        'Positive' Class : Infection       
## 
# Additional model measures
infx_epi <- epiR::epi.tests(table(infx_tot_predict, infx_metab_labs), conf.level = 0.95)

infx_confusion_df <- infx_epi$tab %>% 
  t() %>% 
  as.data.frame() %>% 
  rownames_to_column(var = "actual") %>% 
  mutate(actual = case_when(grepl(actual, pattern = "+", fixed = TRUE) ~ "Actual\nInfection",
                            grepl(actual, pattern = "-", fixed = TRUE) ~ "Actual\nNo Infection",
                            TRUE ~ "Total")) %>% 
  dplyr::rename("Predicted\nInfection" = "Test +",
                "Predicted\nNo Infection" = "Test -") %>% 
  column_to_rownames(var = "actual")

{
pdf(file = "./Results/Figure_7A.pdf", height = 10, width = 10)
plotIndiv(
  infx_train_splsda_final,
  comp = c(1,2),
  pch = 1,
  ind.names = FALSE,
  legend = FALSE,
  background = infx_train_background,
  col = c("goldenrod", "gray75"),
  star = TRUE,
  point.lwd = 0.5,
  title = NULL,
  size.title = 0.00001,
  style = "graphics",
  legend.title = "Infection Group",
  X.label = paste0("Component 1 (", round(infx_train_splsda_final$prop_expl_var$X[1] * 100), "%)"),
  Y.label = paste0("Component 2 (", round(infx_train_splsda_final$prop_expl_var$X[2] * 100), "%)")
) 
addtable2plot(
  y = max(infx_train_splsda_final$variates$X[,2])*0.65,
  x = max(infx_train_splsda_final$variates$X[,1])*0.4,
  infx_confusion_df,
  bty = "o",
  display.rownames = TRUE,
  hlines = TRUE,
  vlines = TRUE,
  cex = 0.75,
  bg = "white"
)
text(
  y = 0,
  x = 6.5,
  substitute(bold("Infection")), cex = 1.75, adj = 0, col = "goldenrod")
text(
  y = 4,
  x = -4.5,
  substitute(bold("No Infection")), cex = 1.75, adj = 0, col = "gray70")
text(
  y = max(infx_train_splsda_final$variates$X[,2])*0.6,
  x = max(infx_train_splsda_final$variates$X[,1])*0.5,
  paste0("ACC = ", 
                    round(infx_cm$overall[1]*100, 1), "% [",
                    round(infx_cm$overall[3]*100, 1), "%, ",
                    round(infx_cm$overall[4]*100, 1), "%]"),
    cex = 0.75, adj = 0)
text(
  y = max(infx_train_splsda_final$variates$X[,2])*0.55,
  x = max(infx_train_splsda_final$variates$X[,1])*0.5,
  paste0("Sens. = ", 
                    paste0(formatC(round(infx_epi$detail[2][3,], 3)*100, digits = 1, format = "f"), "%"), " [",
                    paste0(formatC(round(infx_epi$detail[3][3,], 3)*100, digits = 1, format = "f"), "%"),", ",
                    paste0(formatC(round(infx_epi$detail[4][3,], 3)*100, digits = 1, format = "f"), "%"),"]"),
    cex = 0.75, adj = 0)
text(
  y = max(infx_train_splsda_final$variates$X[,2])*0.5,
  x = max(infx_train_splsda_final$variates$X[,1])*0.5,
  paste0("Spec. = ", 
                    paste0(formatC(round(infx_epi$detail[2][4,], 3)*100, digits = 1, format = "f"), "%"), " [",
                    paste0(formatC(round(infx_epi$detail[3][4,], 3)*100, digits = 1, format = "f"), "%"),", ",
                    paste0(formatC(round(infx_epi$detail[4][4,], 3)*100, digits = 1, format = "f"), "%"),"]"),
    cex = 0.75, adj = 0)
text(
  y = max(infx_train_splsda_final$variates$X[,2])*0.45,
  x = max(infx_train_splsda_final$variates$X[,1])*0.5,
  paste0("OR = ",
                    formatC(round(infx_epi$detail[2][6,], 3), digits = 1, format = "f"),  " [",
                    formatC(round(infx_epi$detail[3][6,], 3), digits = 1, format = "f"), ", ",
                    formatC(round(infx_epi$detail[4][6,], 3), digits = 1, format = "f"), "]"),
    cex = 0.75, adj = 0)
invisible(dev.off())
}

plotIndiv(
  infx_train_splsda_final,
  comp = c(1,2),
  pch = 1,
  ind.names = FALSE,
  legend = FALSE,
  background = infx_train_background,
  col = c("goldenrod", "gray75"),
  star = TRUE,
  point.lwd = 0.5,
  title = NULL,
  size.title = 0.00001,
  style = "graphics",
  legend.title = "Infection Group",
  X.label = paste0("Component 1 (", round(infx_train_splsda_final$prop_expl_var$X[1] * 100), "%)"),
  Y.label = paste0("Component 2 (", round(infx_train_splsda_final$prop_expl_var$X[2] * 100), "%)")
) 
addtable2plot(
  y = max(infx_train_splsda_final$variates$X[,2])*0.65,
  x = max(infx_train_splsda_final$variates$X[,1])*0.4,
  infx_confusion_df,
  bty = "o",
  display.rownames = TRUE,
  hlines = TRUE,
  vlines = TRUE,
  cex = 0.75,
  bg = "white"
)
text(
  y = 0,
  x = 6.5,
  substitute(bold("Infection")), cex = 1.75, adj = 0, col = "goldenrod")
text(
  y = 4,
  x = -4.5,
  substitute(bold("No Infection")), cex = 1.75, adj = 0, col = "gray70")
text(
  y = max(infx_train_splsda_final$variates$X[,2])*0.6,
  x = max(infx_train_splsda_final$variates$X[,1])*0.5,
  paste0("ACC = ", 
                    round(infx_cm$overall[1]*100, 1), "% [",
                    round(infx_cm$overall[3]*100, 1), "%, ",
                    round(infx_cm$overall[4]*100, 1), "%]"),
    cex = 0.75, adj = 0)
text(
  y = max(infx_train_splsda_final$variates$X[,2])*0.55,
  x = max(infx_train_splsda_final$variates$X[,1])*0.5,
  paste0("Sens. = ", 
                    paste0(formatC(round(infx_epi$detail[2][3,], 3)*100, digits = 1, format = "f"), "%"), " [",
                    paste0(formatC(round(infx_epi$detail[3][3,], 3)*100, digits = 1, format = "f"), "%"),", ",
                    paste0(formatC(round(infx_epi$detail[4][3,], 3)*100, digits = 1, format = "f"), "%"),"]"),
    cex = 0.75, adj = 0)
text(
  y = max(infx_train_splsda_final$variates$X[,2])*0.5,
  x = max(infx_train_splsda_final$variates$X[,1])*0.5,
  paste0("Spec. = ", 
                    paste0(formatC(round(infx_epi$detail[2][4,], 3)*100, digits = 1, format = "f"), "%"), " [",
                    paste0(formatC(round(infx_epi$detail[3][4,], 3)*100, digits = 1, format = "f"), "%"),", ",
                    paste0(formatC(round(infx_epi$detail[4][4,], 3)*100, digits = 1, format = "f"), "%"),"]"),
    cex = 0.75, adj = 0)
text(
  y = max(infx_train_splsda_final$variates$X[,2])*0.45,
  x = max(infx_train_splsda_final$variates$X[,1])*0.5,
  paste0("OR = ",
                    formatC(round(infx_epi$detail[2][6,], 3), digits = 1, format = "f"),  " [",
                    formatC(round(infx_epi$detail[3][6,], 3), digits = 1, format = "f"), ", ",
                    formatC(round(infx_epi$detail[4][6,], 3), digits = 1, format = "f"), "]"),
    cex = 0.75, adj = 0)

Combine Figure 6A + 6C: sPLS-DA Enterococcus and Enterobacterales Expansion

axis.title.cex = 3.275
axis.text.cex = 2.5
plot.text.cex = 2.5
plot.group.cex = 2.5
point.cex = 3

{
    pdf(file = "./Results/Figure_6AC.pdf", height = 17, width = 11)
    par(mfrow = c(2, 1), mar = c(5, 5, 1, 1))

    par(mar = c(5, 5, 2, 1))

    # Figure 6A
    plotIndiv(ecoc_doms_train_splsda_final, comp = c(1, 2), pch = 1,
        ind.names = FALSE, legend = FALSE, background = ecoc_doms_train_background,
        col = c("#0C7A3A", "black"), star = TRUE, point.lwd = 0.5,
        title = NULL, size.title = 1e-05, style = "graphics",
        legend.title = "Expansion", X.label = paste0("Component 1 (",
            round(ecoc_doms_train_splsda_final$prop_expl_var$X[1] *
                100), "%)"), Y.label = paste0("Component 2 (",
            round(ecoc_doms_train_splsda_final$prop_expl_var$X[2] *
                100), "%)"))
    addtable2plot(y = min(ecoc_doms_train_splsda_final$variates$X[,
        2]) * 1.05, x = min(ecoc_doms_train_splsda_final$variates$X[,
        1]) * 1.15, ecoc_doms_confusion_df, bty = "o", display.rownames = TRUE,
        hlines = TRUE, vlines = TRUE, cex = 0.75, bg = "white")
    text(y = -3.5, x = 1.5, substitute(bold("Expansion")), cex = 1.75,
        adj = 0, col = "#0C7A3A")
    text(y = 3, x = 0.75, substitute(bold("No Expansion")), cex = 1.75,
        adj = 0, col = "black")
    text(y = min(ecoc_doms_train_splsda_final$variates$X[, 2]) *
        0.825, x = max(ecoc_doms_train_splsda_final$variates$X[,
        1]) * 0.25, paste0("ACC = ", round(ecoc_doms_cm$overall[1] *
        100, 1), "% [", round(ecoc_doms_cm$overall[3] * 100,
        1), "%, ", round(ecoc_doms_cm$overall[4] * 100, 1), "%]"),
        cex = 0.75, adj = 0)
    text(y = min(ecoc_doms_train_splsda_final$variates$X[, 2]) *
        0.875, x = max(ecoc_doms_train_splsda_final$variates$X[,
        1]) * 0.25, paste0("Sens. = ", paste0(formatC(round(ecoc_doms_epi$detail[2][3,
        ], 3) * 100, digits = 1, format = "f"), "%"), " [", paste0(formatC(round(ecoc_doms_epi$detail[3][3,
        ], 3) * 100, digits = 1, format = "f"), "%"), ", ", paste0(formatC(round(ecoc_doms_epi$detail[4][3,
        ], 3) * 100, digits = 1, format = "f"), "%"), "]"), cex = 0.75,
        adj = 0)
    text(y = min(ecoc_doms_train_splsda_final$variates$X[, 2]) *
        0.925, x = max(ecoc_doms_train_splsda_final$variates$X[,
        1]) * 0.25, paste0("Spec. = ", paste0(formatC(round(ecoc_doms_epi$detail[2][4,
        ], 3) * 100, digits = 1, format = "f"), "%"), " [", paste0(formatC(round(ecoc_doms_epi$detail[3][4,
        ], 3) * 100, digits = 1, format = "f"), "%"), ", ", paste0(formatC(round(ecoc_doms_epi$detail[4][4,
        ], 3) * 100, digits = 1, format = "f"), "%"), "]"), cex = 0.75,
        adj = 0)
    text(y = min(ecoc_doms_train_splsda_final$variates$X[, 2]) *
        0.975, x = max(ecoc_doms_train_splsda_final$variates$X[,
        1]) * 0.25, paste0("OR = ", formatC(round(ecoc_doms_epi$detail[2][6,
        ], 3), digits = 1, format = "f"), " [", formatC(round(ecoc_doms_epi$detail[3][6,
        ], 3), digits = 1, format = "f"), ", ", formatC(round(ecoc_doms_epi$detail[4][6,
        ], 3), digits = 1, format = "f"), "]"), cex = 0.75, adj = 0)

    # Figure 6C
    par(mar = c(5, 5, 2, 1))
    plotIndiv(ebac_doms_train_splsda_final, comp = c(1, 2), ylim = c(-5,
        16), pch = 1, ind.names = FALSE, legend = FALSE, background = ebac_doms_train_background,
        col = c("#FF0000", "black"), star = TRUE, point.lwd = 0.5,
        title = NULL, size.title = 1e-05, style = "graphics",
        legend.title = "Expansion", X.label = paste0("Component 1 (",
            round(ebac_doms_train_splsda_final$prop_expl_var$X[1] *
                100), "%)"), Y.label = paste0("Component 2 (",
            round(ebac_doms_train_splsda_final$prop_expl_var$X[2] *
                100), "%)"))
    addtable2plot(y = min(ebac_doms_train_splsda_final$variates$X[,
        2]) * -3, x = min(ebac_doms_train_splsda_final$variates$X[,
        1]) * -10, ebac_doms_confusion_df, bty = "o", display.rownames = TRUE,
        hlines = TRUE, vlines = TRUE, cex = 0.75, bg = "white")
    text(y = 12, x = 1.25, substitute(bold("Expansion")), cex = 1.75,
        adj = 0, col = "#FF0000")
    text(y = -4, x = 0.5, substitute(bold("No Expansion")), cex = 1.75,
        adj = 0, col = "black")
    text(y = min(ebac_doms_train_splsda_final$variates$X[, 2]) *
        -2.25, x = min(ebac_doms_train_splsda_final$variates$X[,
        1]) * -12, paste0("ACC = ", round(ebac_doms_cm$overall[1] *
        100, 1), "% [", round(ebac_doms_cm$overall[3] * 100,
        1), "%, ", round(ebac_doms_cm$overall[4] * 100, 1), "%]"),
        cex = 0.75, adj = 0)
    text(y = min(ebac_doms_train_splsda_final$variates$X[, 2]) *
        -2.05, x = min(ebac_doms_train_splsda_final$variates$X[,
        1]) * -12, paste0("Sens. = ", paste0(formatC(round(ebac_doms_epi$detail[2][3,
        ], 3) * 100, digits = 1, format = "f"), "%"), " [", paste0(formatC(round(ebac_doms_epi$detail[3][3,
        ], 3) * 100, digits = 1, format = "f"), "%"), ", ", paste0(formatC(round(ebac_doms_epi$detail[4][3,
        ], 3) * 100, digits = 1, format = "f"), "%"), "]"), cex = 0.75,
        adj = 0)
    text(y = min(ebac_doms_train_splsda_final$variates$X[, 2]) *
        -1.85, x = min(ebac_doms_train_splsda_final$variates$X[,
        1]) * -12, paste0("Spec. = ", paste0(formatC(round(ebac_doms_epi$detail[2][4,
        ], 3) * 100, digits = 1, format = "f"), "%"), " [", paste0(formatC(round(ebac_doms_epi$detail[3][4,
        ], 3) * 100, digits = 1, format = "f"), "%"), ", ", paste0(formatC(round(ebac_doms_epi$detail[4][4,
        ], 3) * 100, digits = 1, format = "f"), "%"), "]"), cex = 0.75,
        adj = 0)
    text(y = min(ebac_doms_train_splsda_final$variates$X[, 2]) *
        -1.65, x = min(ebac_doms_train_splsda_final$variates$X[,
        1]) * -12, paste0("OR = ", formatC(round(ebac_doms_epi$detail[2][6,
        ], 3), digits = 1, format = "f"), " [", formatC(round(ebac_doms_epi$detail[3][6,
        ], 3), digits = 1, format = "f"), ", ", formatC(round(ebac_doms_epi$detail[4][6,
        ], 3), digits = 1, format = "f"), "]"), cex = 0.75, adj = 0)

    invisible(dev.off())
}

Supplemental Figure 3: Plot Loadings for Diversity Groups

# Take absolute value to make diversity group loadings plot narrower
diversity_train_splsda_final$loadings$X <- abs(diversity_train_splsda_final$loadings$X)

# Combine Supplemental Figures 3A + 3B

{pdf(file = "./Results/Supplemental_Figure_3.pdf",
         height = 8, 
         width = 11)

par(mfrow = c(1,3))

# Supplmental Figure 3A
plotLoadings(
  diversity_train_splsda_final,
  contrib = 'max',
  method = 'mean',
  comp = 1,
  legend = FALSE,
  legend.col = c("#EDE342", "#F69A97", "#FF51EB"),
  size.name = 1.1,
  size.title = rel(1),
  ndisplay = 50
)

# Supplmental Figure 3B
plotLoadings(
  diversity_train_splsda_final,
  contrib = 'max',
  method = 'mean',
  comp = 2,
  legend.col = c("#EDE342", "#F69A97", "#FF51EB"),
  size.name = 1.1,
  size.title = rel(1),
  ndisplay = 50
)

invisible(dev.off())
}

par(mfrow = c(1,3))

# Supplmental Figure 3A
plotLoadings(
  diversity_train_splsda_final,
  contrib = 'max',
  method = 'mean',
  comp = 1,
  legend = FALSE,
  legend.col = c("#EDE342", "#F69A97", "#FF51EB"),
  size.name = 0.6,
  size.title = rel(1),
  ndisplay = 50
)

# Supplmental Figure 3B
plotLoadings(
  diversity_train_splsda_final,
  contrib = 'max',
  method = 'mean',
  comp = 2,
  legend.col = c("#EDE342", "#F69A97", "#FF51EB"),
  size.name = 0.6,
  size.title = rel(1),
  ndisplay = 50
)

#### Metaphlan + Qual Metabolites ####
heatmap_data_raw <- t_metaphlan %>% 
  drop_na(taxid) %>% 
  select(sampleID) %>% 
  group_by(sampleID) %>% 
  dplyr::slice(1) %>% 
  mutate(db = ifelse(grepl(sampleID, pattern = "lt"), "Liver Transplant", "Healthy Donor")) %>%
  left_join(metab_qual_anon) %>% 
  mutate(compound = ifelse(compound == "isovaleric-acid", "isovalerate", compound),
         compound = str_to_title(compound),
         compound = recode(compound,
                          Preq1 = "PreQ1")) %>% 
  filter(compound %in% heatmap_cmpds$compound|is.na(compound)) %>% 
  group_by(sampleID, compound) %>% 
  slice_max(mvalue, with_ties = F, n = 1) %>% 
  ungroup() %>%
  select(-db) %>% 
  left_join(metaphlan_df2 %>%
               left_join(metaphlan_df_sumry %>% select(sampleID, db)) %>%
               distinct(sampleID, db), by = "sampleID") %>% 
  group_by(sampleID, compound) %>% 
  slice_max(mvalue, with_ties = F, n = 1) %>% 
  left_join(alpha_shannon) %>% 
  group_by(db) %>% 
  arrange(db, Shannon) %>% 
  ungroup()

diversity_train_splsda_final_comp2 <- mixOmics::selectVar(diversity_train_splsda_final, comp = 2)$name

gg_diversity_comp2 <- metaphlan_df2 %>% 
  left_join(metaphlan_df_sumry %>% select(sampleID, diversity_group)) %>%
  select(sampleID, diversity_group) %>% 
  distinct(sampleID, diversity_group) %>% 
  left_join(heatmap_data_raw) %>% 
  filter(compound %in% diversity_train_splsda_final_comp2 | is.na(compound)) %>% 
  mutate(diversity_group = factor(diversity_group, levels = c("Low Diversity", "Medium Diversity",
                                                              "High Diversity", "Healthy Donor"))) %>%
  pivot_wider(id_cols = c(sampleID, diversity_group, patientID, db, Shannon), names_from = "compound", values_from = "mvalue", values_fill = NA) %>%  # For missing values, just for plotting
  pivot_longer(-c(sampleID, diversity_group, patientID, db, Shannon), names_to = "compound", values_to = "mvalue") %>% 
  mutate(compound = factor(compound, levels = c(diversity_train_splsda_final_comp2))) %>% 
  # filter(compound == "Omega-Muricholic Acid") %>% arrange(desc(mvalue))
  filter(compound != "NA") %>% 
  group_by(sampleID, compound) %>% 
  dplyr::slice(1) %>% 
  ggplot(aes(x = reorder(sampleID, Shannon), y = mvalue, fill = diversity_group)) +
  geom_col() +
  scale_fill_manual(values = diversity_group_colors) +
  theme_bw() +
  theme(legend.position = "none",
        axis.text.x=eb(),
        axis.ticks.x=eb(),
        # strip.text.x= et(angle = 0, size = 14, color = "black"),
        strip.text.x = eb(),
        strip.background = eb(),
        strip.text.y = et(angle = 0, size = 14, color = "black", hjust = 0),
        axis.title.y = et(color = "black", size = 14),
        axis.text.y = et(color = "black", size = 12),
        panel.spacing = unit(0.5, "lines"),
        plot.margin = margin(t = 5,
                             r = 5, 
                             b = 0, 
                             l = 5),
        panel.grid.minor = eb()) +
  facet_grid(compound~diversity_group, scales = "free")+
  scale_y_continuous(expand = expansion(mult = c(0.005,0.005))) +
  ylab("Normalized Peak Area") +
  xlab("")

  

pdf(file = "./Results/Metahphlan_Medium_Diversity_Compounds.pdf", height = 16, width = 20, onefile = FALSE)
gg.stack(gg_metaphlan,
         gg_diversity_comp2,
         heights = c(1, 4))
dev.off()
## quartz_off_screen 
##                 2

Supplemental Figure 6A+B: Plot Loadings for Enterococcus Expansion

# Take absolute value to make diversity group loadings plot
# narrower
ecoc_doms_train_splsda_final$loadings$X <- abs(ecoc_doms_train_splsda_final$loadings$X)

# Supplemental Figure 5AB

{
    pdf(file = "./Results/Supplemental_Figure_6AB.pdf", height = 8,
        width = 11)

    par(mfrow = c(1, 3))

    plotLoadings(ecoc_doms_train_splsda_final, contrib = "max",
        method = "mean", comp = 1, legend = FALSE, legend.col = c("#0C7A3A",
            "black"), size.name = 1.1, size.title = rel(1), ndisplay = 50)

    plotLoadings(ecoc_doms_train_splsda_final, contrib = "max",
        method = "mean", comp = 2, legend.col = c("#0C7A3A",
            "black"), size.name = 1.1, size.title = rel(1), ndisplay = 50)

    invisible(dev.off())
}

par(mfrow = c(1, 3))


plotLoadings(ecoc_doms_train_splsda_final, contrib = "max", method = "mean",
    comp = 1, legend = FALSE, legend.col = c("#0C7A3A", "black"),
    size.name = 0.6, size.title = rel(1), ndisplay = 50)

plotLoadings(ecoc_doms_train_splsda_final, contrib = "max", method = "mean",
    comp = 2, legend.col = c("#0C7A3A", "black"), size.name = 0.6,
    size.title = rel(1), ndisplay = 50)

Supplemental Figure 6C+D: Plot Loadings for Enterobacterales Expansion

# Take absolute value to make diversity group loadings plot
# narrower
ebac_doms_train_splsda_final$loadings$X <- abs(ebac_doms_train_splsda_final$loadings$X)

# Supplemental Figure 5CD

{
    pdf(file = "./Results/Supplemental_Figure_6CD.pdf", height = 8,
        width = 11)

    par(mfrow = c(1, 3))

    plotLoadings(ebac_doms_train_splsda_final, contrib = "max",
        method = "mean", comp = 1, legend = FALSE, legend.col = c("#FF0000",
            "black"), size.name = 1.1, size.title = rel(1), ndisplay = 50)

    plotLoadings(ebac_doms_train_splsda_final, contrib = "max",
        method = "mean", comp = 2, legend.col = c("#FF0000",
            "black"), size.name = 1.1, size.title = rel(1), ndisplay = 50)

    invisible(dev.off())
}

par(mfrow = c(1, 3))


plotLoadings(ebac_doms_train_splsda_final, contrib = "max", method = "mean",
    comp = 1, legend = FALSE, legend.col = c("#FF0000", "black"),
    size.name = 0.6, size.title = rel(1), ndisplay = 50)

plotLoadings(ebac_doms_train_splsda_final, contrib = "max", method = "mean",
    comp = 2, legend.col = c("#FF0000", "black"), size.name = 0.6,
    size.title = rel(1), ndisplay = 50)

Supplemental Figure 7A+B: Plot Loadings for Any Bacterial Infection

# Take absolute value to make diversity group loadings plot
# narrower
infx_train_splsda_final$loadings$X <- abs(infx_train_splsda_final$loadings$X)

{
    pdf(file = "./Results/Supplemental_Figure_7.pdf", height = 8,
        width = 11)

    par(mfrow = c(1, 3))

    # Supplmental Figure 7A
    plotLoadings(infx_train_splsda_final, contrib = "max", method = "mean",
        comp = 1, legend = FALSE, legend.col = c("goldenrod",
            "gray75"), size.name = 1.1, size.title = rel(1),
        ndisplay = 50)

    # Supplmental Figure 7B
    plotLoadings(infx_train_splsda_final, contrib = "max", method = "mean",
        comp = 2, legend.col = c("goldenrod", "gray75"), size.name = 1.1,
        size.title = rel(1), ndisplay = 50)

    invisible(dev.off())
}

par(mfrow = c(1, 3))

# Supplmental Figure 7A
plotLoadings(infx_train_splsda_final, contrib = "max", method = "mean",
    comp = 1, legend = FALSE, legend.col = c("goldenrod", "gray75"),
    size.name = 0.6, size.title = rel(1), ndisplay = 50)

# Supplmental Figure 7B
plotLoadings(infx_train_splsda_final, contrib = "max", method = "mean",
    comp = 2, legend.col = c("goldenrod", "gray75"), size.name = 0.6,
    size.title = rel(1), ndisplay = 50)

Tables

Supplemental Figure 5: Flow Diagram

flow_chart <- flow_exclusions(incl_counts = c(158, 130, 107,
    25), total_label = "Total Patients Enrolled", incl_labels = c("Patients w/ Transplant",
    "Patients Included", "Patients w/ Bacterial Infection"),
    excl_labels = c("No Transplant", "No Sample In Study Period\nDay -7 to +30",
        "Patients w/o Bacterial Infection"), show_count = TRUE)

flow_chart
flow_chart %>%
    export_svg() %>%
    charToRaw() %>%
    rsvg_pdf("./Results/Supplemental_Figure_5.pdf")

Demographics

## Vector of variables to summarize
demo_vars <- c("race", "sex", "age", "meld_transplant", "Alcoholic Hepatitis",
    "Alcoholic Cirrhosis", "NAFLD/NASH", "Primary Sclerosing Cholangitis",
    "Acute Viral Hepatitis", "Chronic Hepatitis B", "Chronic Hepatitis C",
    "Autoimmune", "Wilson's Disease", "Alpha-1 Antitrypsin",
    "Hemachromatosis", "Drug Induced Liver Injury or Toxin",
    "Budd Chiari", "Cryptogenic", "Malignancy", "Other", "Dialysis",
    "Pressers", "Mechanical Ventilation")
## Vector of categorical variables that need transformation
demo_cats <- c("race", "sex", "Alcoholic Hepatitis", "Alcoholic Cirrhosis",
    "NAFLD/NASH", "Primary Sclerosing Cholangitis", "Acute Viral Hepatitis",
    "Chronic Hepatitis B", "Chronic Hepatitis C", "Autoimmune",
    "Wilson's Disease", "Alpha-1 Antitrypsin", "Hemachromatosis",
    "Drug Induced Liver Injury or Toxin", "Budd Chiari", "Cryptogenic",
    "Malignancy", "Other", "Dialysis", "Pressers", "Mechanical Ventilation")


tab1_1 <- CreateTableOne(vars = demo_vars, testNonNormal = "wilcox.test",
    includeNA = TRUE, factorVars = demo_cats, strata = "any_infection",
    data = demo)

summary(tab1_1)  # Age is potentially skewed, need to state that it is skewed and re-run `CreateTableOne`
## 
##      ### Summary of continuous variables ###
## 
## any_infection: 0
##                  n miss p.miss mean sd median p25 p75 min max  skew kurt
## age             82    0      0   51 17     55  40  63   2  77 -1.00  0.8
## meld_transplant 82    0      0   26 10     28  19  33   6  49 -0.04 -0.7
## ------------------------------------------------------------ 
## any_infection: 1
##                  n miss p.miss mean sd median p25 p75 min max skew   kurt
## age             25    0      0   56 15     59  46  68  22  73 -0.9  0.002
## meld_transplant 25    0      0   27 10     30  19  33  11  42 -0.3 -1.057
## 
## p-values
##                   pNormal pNonNormal
## age             0.2072738  0.1765252
## meld_transplant 0.6624494  0.6085202
## 
## Standardize mean differences
##                    1 vs 2
## age             0.2976093
## meld_transplant 0.1025141
## 
## =======================================================================================
## 
##      ### Summary of categorical variables ### 
## 
## any_infection: 0
##                                 var  n miss p.miss
##                                race 82    0    0.0
##                                                   
##                                                   
##                                                   
##                                                   
##                                                   
##                                                   
##                                                   
##                                 sex 82    0    0.0
##                                                   
##                                                   
##                 Alcoholic Hepatitis 82    0    0.0
##                                                   
##                                                   
##                 Alcoholic Cirrhosis 82    0    0.0
##                                                   
##                                                   
##                          NAFLD/NASH 82    0    0.0
##                                                   
##                                                   
##      Primary Sclerosing Cholangitis 82    0    0.0
##                                                   
##                                                   
##               Acute Viral Hepatitis 82    0    0.0
##                                                   
##                                                   
##                 Chronic Hepatitis B 82    0    0.0
##                                                   
##                                                   
##                 Chronic Hepatitis C 82    0    0.0
##                                                   
##                                                   
##                          Autoimmune 82    0    0.0
##                                                   
##                                                   
##                    Wilson's Disease 82    0    0.0
##                                                   
##                                                   
##                 Alpha-1 Antitrypsin 82    0    0.0
##                                                   
##                     Hemachromatosis 82    0    0.0
##                                                   
##                                                   
##  Drug Induced Liver Injury or Toxin 82    0    0.0
##                                                   
##                                                   
##                         Budd Chiari 82    0    0.0
##                                                   
##                         Cryptogenic 82    0    0.0
##                                                   
##                                                   
##                          Malignancy 82    0    0.0
##                                                   
##                                                   
##                               Other 82    0    0.0
##                                                   
##                                                   
##                            Dialysis 82    0    0.0
##                                                   
##                                                   
##                            Pressers 82    0    0.0
##                                                   
##                                                   
##              Mechanical Ventilation 82    0    0.0
##                                                   
##                                                   
##                             level freq percent cum.percent
##  American Indian or Alaska Native    1     1.2         1.2
##              Asian/Mideast Indian    6     7.3         8.5
##            Black/African-American    9    11.0        19.5
##                More than one Race    8     9.8        29.3
##                  Patient Declined    4     4.9        34.1
##                           Unknown    0     0.0        34.1
##                             White   54    65.9       100.0
##                                                           
##                            Female   38    46.3        46.3
##                              Male   44    53.7       100.0
##                                                           
##                                 0   76    92.7        92.7
##                                 1    6     7.3       100.0
##                                                           
##                                 0   42    51.2        51.2
##                                 1   40    48.8       100.0
##                                                           
##                                 0   72    87.8        87.8
##                                 1   10    12.2       100.0
##                                                           
##                                 0   76    92.7        92.7
##                                 1    6     7.3       100.0
##                                                           
##                                 0   78    95.1        95.1
##                                 1    4     4.9       100.0
##                                                           
##                                 0   82   100.0       100.0
##                                 1    0     0.0       100.0
##                                                           
##                                 0   78    95.1        95.1
##                                 1    4     4.9       100.0
##                                                           
##                                 0   77    93.9        93.9
##                                 1    5     6.1       100.0
##                                                           
##                                 0   80    97.6        97.6
##                                 1    2     2.4       100.0
##                                                           
##                                 0   82   100.0       100.0
##                                                           
##                                 0   82   100.0       100.0
##                                 1    0     0.0       100.0
##                                                           
##                                 0   81    98.8        98.8
##                                 1    1     1.2       100.0
##                                                           
##                                 0   82   100.0       100.0
##                                                           
##                                 0   78    95.1        95.1
##                                 1    4     4.9       100.0
##                                                           
##                                 0   65    79.3        79.3
##                                 1   17    20.7       100.0
##                                                           
##                                 0   74    90.2        90.2
##                                 1    8     9.8       100.0
##                                                           
##                                 0   60    73.2        73.2
##                                 1   22    26.8       100.0
##                                                           
##                                 0   76    92.7        92.7
##                                 1    6     7.3       100.0
##                                                           
##                                 0   77    93.9        93.9
##                                 1    5     6.1       100.0
##                                                           
## ------------------------------------------------------------ 
## any_infection: 1
##                                 var  n miss p.miss
##                                race 25    0    0.0
##                                                   
##                                                   
##                                                   
##                                                   
##                                                   
##                                                   
##                                                   
##                                 sex 25    0    0.0
##                                                   
##                                                   
##                 Alcoholic Hepatitis 25    0    0.0
##                                                   
##                                                   
##                 Alcoholic Cirrhosis 25    0    0.0
##                                                   
##                                                   
##                          NAFLD/NASH 25    0    0.0
##                                                   
##                                                   
##      Primary Sclerosing Cholangitis 25    0    0.0
##                                                   
##                                                   
##               Acute Viral Hepatitis 25    0    0.0
##                                                   
##                                                   
##                 Chronic Hepatitis B 25    0    0.0
##                                                   
##                                                   
##                 Chronic Hepatitis C 25    0    0.0
##                                                   
##                                                   
##                          Autoimmune 25    0    0.0
##                                                   
##                                                   
##                    Wilson's Disease 25    0    0.0
##                                                   
##                                                   
##                 Alpha-1 Antitrypsin 25    0    0.0
##                                                   
##                     Hemachromatosis 25    0    0.0
##                                                   
##                                                   
##  Drug Induced Liver Injury or Toxin 25    0    0.0
##                                                   
##                                                   
##                         Budd Chiari 25    0    0.0
##                                                   
##                         Cryptogenic 25    0    0.0
##                                                   
##                                                   
##                          Malignancy 25    0    0.0
##                                                   
##                                                   
##                               Other 25    0    0.0
##                                                   
##                                                   
##                            Dialysis 25    0    0.0
##                                                   
##                                                   
##                            Pressers 25    0    0.0
##                                                   
##                                                   
##              Mechanical Ventilation 25    0    0.0
##                                                   
##                                                   
##                             level freq percent cum.percent
##  American Indian or Alaska Native    0     0.0         0.0
##              Asian/Mideast Indian    2     8.0         8.0
##            Black/African-American    2     8.0        16.0
##                More than one Race    2     8.0        24.0
##                  Patient Declined    1     4.0        28.0
##                           Unknown    2     8.0        36.0
##                             White   16    64.0       100.0
##                                                           
##                            Female    9    36.0        36.0
##                              Male   16    64.0       100.0
##                                                           
##                                 0   23    92.0        92.0
##                                 1    2     8.0       100.0
##                                                           
##                                 0   17    68.0        68.0
##                                 1    8    32.0       100.0
##                                                           
##                                 0   19    76.0        76.0
##                                 1    6    24.0       100.0
##                                                           
##                                 0   25   100.0       100.0
##                                 1    0     0.0       100.0
##                                                           
##                                 0   25   100.0       100.0
##                                 1    0     0.0       100.0
##                                                           
##                                 0   24    96.0        96.0
##                                 1    1     4.0       100.0
##                                                           
##                                 0   25   100.0       100.0
##                                 1    0     0.0       100.0
##                                                           
##                                 0   25   100.0       100.0
##                                 1    0     0.0       100.0
##                                                           
##                                 0   24    96.0        96.0
##                                 1    1     4.0       100.0
##                                                           
##                                 0   25   100.0       100.0
##                                                           
##                                 0   24    96.0        96.0
##                                 1    1     4.0       100.0
##                                                           
##                                 0   25   100.0       100.0
##                                 1    0     0.0       100.0
##                                                           
##                                 0   25   100.0       100.0
##                                                           
##                                 0   24    96.0        96.0
##                                 1    1     4.0       100.0
##                                                           
##                                 0   19    76.0        76.0
##                                 1    6    24.0       100.0
##                                                           
##                                 0   20    80.0        80.0
##                                 1    5    20.0       100.0
##                                                           
##                                 0   16    64.0        64.0
##                                 1    9    36.0       100.0
##                                                           
##                                 0   20    80.0        80.0
##                                 1    5    20.0       100.0
##                                                           
##                                 0   23    92.0        92.0
##                                 1    2     8.0       100.0
##                                                           
## 
## p-values
##                                      pApprox    pExact
## race                               0.3074905 0.4275466
## sex                                0.4953032 0.4904875
## Alcoholic Hepatitis                1.0000000 1.0000000
## Alcoholic Cirrhosis                0.2123469 0.1713637
## NAFLD/NASH                         0.2590605 0.1979409
## Primary Sclerosing Cholangitis     0.3704754 0.3322151
## Acute Viral Hepatitis              0.6007080 0.5709809
## Chronic Hepatitis B                0.5271112 0.2336449
## Chronic Hepatitis C                0.6007080 0.5709809
## Autoimmune                         0.4694777 0.5886832
## Wilson's Disease                   1.0000000 0.5538202
## Alpha-1 Antitrypsin                       NA        NA
## Hemachromatosis                    0.5271112 0.2336449
## Drug Induced Liver Injury or Toxin 1.0000000 1.0000000
## Budd Chiari                               NA        NA
## Cryptogenic                        1.0000000 1.0000000
## Malignancy                         0.9440592 0.7828238
## Other                              0.3063991 0.1774710
## Dialysis                           0.5266900 0.4513768
## Pressers                           0.1465607 0.1238754
## Mechanical Ventilation             1.0000000 0.6642869
## 
## Standardize mean differences
##                                        1 vs 2
## race                               0.45891730
## sex                                0.21130091
## Alcoholic Hepatitis                0.02568258
## Alcoholic Cirrhosis                0.34709743
## NAFLD/NASH                         0.31029007
## Primary Sclerosing Cholangitis     0.39735971
## Acute Viral Hepatitis              0.32025631
## Chronic Hepatitis B                0.28867513
## Chronic Hepatitis C                0.32025631
## Autoimmune                         0.36037499
## Wilson's Disease                   0.08851811
## Alpha-1 Antitrypsin                0.00000000
## Hemachromatosis                    0.28867513
## Drug Induced Liver Injury or Toxin 0.15713484
## Budd Chiari                        0.00000000
## Cryptogenic                        0.04264158
## Malignancy                         0.07849392
## Other                              0.29088216
## Dialysis                           0.19854168
## Pressers                           0.37578690
## Mechanical Ventilation             0.07437488
tableone_skewed <- c("age", "meld_transplant")

tab1_2 <- print(tab1_1, nonnormal = tableone_skewed, formatOptions = list(big.mark = ","))
##                                             Stratified by any_infection
##                                              0                   
##   n                                             82               
##   race (%)                                                       
##      American Indian or Alaska Native            1 (  1.2)       
##      Asian/Mideast Indian                        6 (  7.3)       
##      Black/African-American                      9 ( 11.0)       
##      More than one Race                          8 (  9.8)       
##      Patient Declined                            4 (  4.9)       
##      Unknown                                     0 (  0.0)       
##      White                                      54 ( 65.9)       
##   sex = Male (%)                                44 ( 53.7)       
##   age (median [IQR])                         55.00 [39.50, 63.00]
##   meld_transplant (median [IQR])             28.00 [19.00, 33.00]
##   Alcoholic Hepatitis = 1 (%)                    6 (  7.3)       
##   Alcoholic Cirrhosis = 1 (%)                   40 ( 48.8)       
##   NAFLD/NASH = 1 (%)                            10 ( 12.2)       
##   Primary Sclerosing Cholangitis = 1 (%)         6 (  7.3)       
##   Acute Viral Hepatitis = 1 (%)                  4 (  4.9)       
##   Chronic Hepatitis B = 1 (%)                    0 (  0.0)       
##   Chronic Hepatitis C = 1 (%)                    4 (  4.9)       
##   Autoimmune = 1 (%)                             5 (  6.1)       
##   Wilson's Disease = 1 (%)                       2 (  2.4)       
##   Alpha-1 Antitrypsin = 0 (%)                   82 (100.0)       
##   Hemachromatosis = 1 (%)                        0 (  0.0)       
##   Drug Induced Liver Injury or Toxin = 1 (%)     1 (  1.2)       
##   Budd Chiari = 0 (%)                           82 (100.0)       
##   Cryptogenic = 1 (%)                            4 (  4.9)       
##   Malignancy = 1 (%)                            17 ( 20.7)       
##   Other = 1 (%)                                  8 (  9.8)       
##   Dialysis = 1 (%)                              22 ( 26.8)       
##   Pressers = 1 (%)                               6 (  7.3)       
##   Mechanical Ventilation = 1 (%)                 5 (  6.1)       
##                                             Stratified by any_infection
##                                              1                    p     
##   n                                             25                      
##   race (%)                                                         0.307
##      American Indian or Alaska Native            0 (  0.0)              
##      Asian/Mideast Indian                        2 (  8.0)              
##      Black/African-American                      2 (  8.0)              
##      More than one Race                          2 (  8.0)              
##      Patient Declined                            1 (  4.0)              
##      Unknown                                     2 (  8.0)              
##      White                                      16 ( 64.0)              
##   sex = Male (%)                                16 ( 64.0)         0.495
##   age (median [IQR])                         59.00 [46.00, 68.00]  0.177
##   meld_transplant (median [IQR])             30.00 [19.00, 33.00]  0.609
##   Alcoholic Hepatitis = 1 (%)                    2 (  8.0)         1.000
##   Alcoholic Cirrhosis = 1 (%)                    8 ( 32.0)         0.212
##   NAFLD/NASH = 1 (%)                             6 ( 24.0)         0.259
##   Primary Sclerosing Cholangitis = 1 (%)         0 (  0.0)         0.370
##   Acute Viral Hepatitis = 1 (%)                  0 (  0.0)         0.601
##   Chronic Hepatitis B = 1 (%)                    1 (  4.0)         0.527
##   Chronic Hepatitis C = 1 (%)                    0 (  0.0)         0.601
##   Autoimmune = 1 (%)                             0 (  0.0)         0.469
##   Wilson's Disease = 1 (%)                       1 (  4.0)         1.000
##   Alpha-1 Antitrypsin = 0 (%)                   25 (100.0)            NA
##   Hemachromatosis = 1 (%)                        1 (  4.0)         0.527
##   Drug Induced Liver Injury or Toxin = 1 (%)     0 (  0.0)         1.000
##   Budd Chiari = 0 (%)                           25 (100.0)            NA
##   Cryptogenic = 1 (%)                            1 (  4.0)         1.000
##   Malignancy = 1 (%)                             6 ( 24.0)         0.944
##   Other = 1 (%)                                  5 ( 20.0)         0.306
##   Dialysis = 1 (%)                               9 ( 36.0)         0.527
##   Pressers = 1 (%)                               5 ( 20.0)         0.147
##   Mechanical Ventilation = 1 (%)                 2 (  8.0)         1.000
##                                             Stratified by any_infection
##                                              test   
##   n                                                 
##   race (%)                                          
##      American Indian or Alaska Native               
##      Asian/Mideast Indian                           
##      Black/African-American                         
##      More than one Race                             
##      Patient Declined                               
##      Unknown                                        
##      White                                          
##   sex = Male (%)                                    
##   age (median [IQR])                         nonnorm
##   meld_transplant (median [IQR])             nonnorm
##   Alcoholic Hepatitis = 1 (%)                       
##   Alcoholic Cirrhosis = 1 (%)                       
##   NAFLD/NASH = 1 (%)                                
##   Primary Sclerosing Cholangitis = 1 (%)            
##   Acute Viral Hepatitis = 1 (%)                     
##   Chronic Hepatitis B = 1 (%)                       
##   Chronic Hepatitis C = 1 (%)                       
##   Autoimmune = 1 (%)                                
##   Wilson's Disease = 1 (%)                          
##   Alpha-1 Antitrypsin = 0 (%)                       
##   Hemachromatosis = 1 (%)                           
##   Drug Induced Liver Injury or Toxin = 1 (%)        
##   Budd Chiari = 0 (%)                               
##   Cryptogenic = 1 (%)                               
##   Malignancy = 1 (%)                                
##   Other = 1 (%)                                     
##   Dialysis = 1 (%)                                  
##   Pressers = 1 (%)                                  
##   Mechanical Ventilation = 1 (%)
write.csv(tab1_2, "./Results/Demo_Table_1.csv", row.names = TRUE)  # Saving then reading in the same data allows for an easy way to adjust p-values, since it loads the object as a dataframe

# Need to adjust pvalues and arrange properly....hence the
# multiple dataframes below
tab1_2_padjust1 <- read.csv("./Results/Demo_Table_1.csv") %>%
    dplyr::rename(` ` = X, `No Infection` = X0, `Bacterial Infection` = X1)

tab1_2_padjust2 <- tab1_2_padjust1 %>%
    mutate(` ` = factor(` `, levels = tab1_2_padjust1$` `))

tab1_2_padjust3 <- tab1_2_padjust1 %>%
    mutate(test = ifelse(!is.na(p) & test == "", "chi.sq", test)) %>%
    group_by(test) %>%
    rstatix::adjust_pvalue(p.col = "p", method = "BH") %>%
    ungroup() %>%
    mutate(` ` = factor(` `, tab1_2_padjust2$` `)) %>%
    arrange(` `) %>%
    mutate(p = ifelse(is.na(p), "", p), p.adj = ifelse(is.na(p.adj),
        "", p.adj))

# Read in csv to then append adjusted pvalues
write.csv(tab1_2_padjust3, "./Results/Demo_Table_1_padjust.csv",
    row.names = FALSE)

Culture: Supplemental Table 3

# Percentage of infections # where there is an infection
# within 30 days after transplant and not 0 days before
cult_percent_infx_all <- peri_criteria_all %>%
    filter(bact_infection_present == "Yes", between(eday, 0,
        30)) %>%
    group_by(patientID, sampleID, eday) %>%
    arrange(-infx_stool) %>%
    dplyr::slice(1) %>%
    ungroup() %>%
    select(patientID, sampleID, bact_infection_present, infx_stool,
        organism1, micro1.factor) %>%
    distinct() %>%
    mutate(organism1 = gsub(x = organism1, pattern = "\\s+",
        replacement = ""), organism1 = str_to_lower(string = organism1),
        organism1 = ifelse(grepl(x = organism1, pattern = "enterococcus|enterobacterales|klebsiella|escherichia|citrobacter|proteus|staphyl|clostrid|pseudo|steno|bacteroides|helico"),
            organism1, "Culture Negative")) %>%
    group_by(patientID, sampleID, infx_stool) %>%
    mutate(`Enterococcus faecium` = grepl(x = organism1, pattern = "enterococcusfaecium",
        ignore.case = T), `Enterococcus faecalis` = grepl(x = organism1,
        pattern = "enterococcusfaecalis", ignore.case = T), `Enterococcus avium` = grepl(x = organism1,
        pattern = "enterococcusavium", ignore.case = T), `Enterococcus gallinarum` = grepl(x = organism1,
        pattern = "enterococcusgallinarum", ignore.case = T),
        `Klebsiella pneumoniae` = grepl(x = organism1, pattern = "klebsiellapneumoniae",
            ignore.case = T), `Enterobacter cloaceae` = grepl(x = organism1,
            pattern = "enterobactercloaceae", ignore.case = T),
        `Escherichia coli` = grepl(x = organism1, pattern = "escherichiacoli",
            ignore.case = T), `Citrobacter freundii` = grepl(x = organism1,
            pattern = "citrobacterfreundii", ignore.case = T),
        `Proteus mirabilis` = grepl(x = organism1, pattern = "proteusmirabilis",
            ignore.case = T), `Staphylococcus aureus` = grepl(x = organism1,
            pattern = "staphylococcusaureus", ignore.case = T),
        `Staphylococcus epidermis` = grepl(x = organism1, pattern = "staphylococcusepidermidis",
            ignore.case = T), `Pseudomonas aeruginosa` = grepl(x = organism1,
            pattern = "pseudomonasaeruginosa", ignore.case = T),
        `Stenotrophmonas maltophilia` = grepl(x = organism1,
            pattern = "stenotrophmonasmaltophilia", ignore.case = T),
        `Helicobacter pylori` = grepl(x = organism1, pattern = "helicobacterpylori",
            ignore.case = T), `Clostridium difficile` = grepl(x = organism1,
            pattern = "clostridiumdifficile|clostridioidesdifficile",
            ignore.case = T), `Bacteroides sp.` = grepl(x = organism1,
            pattern = "bacteroides", ignore.case = T), `Culture Negative` = grepl(x = organism1,
            pattern = "Culture Negative", ignore.case = T)) %>%
    pivot_longer(-c(patientID:micro1.factor), names_to = "organisms",
        values_to = "org_presence") %>%
    mutate(organisms = ifelse(bact_infection_present == "No",
        "No Bacterial Infection", organisms), org_presence = ifelse(org_presence ==
        TRUE, 1, 0)) %>%
    group_by(sampleID, infx_stool, bact_infection_present, organisms) %>%
    dplyr::slice_max(org_presence) %>%
    ungroup() %>%
    filter(organisms != "No Bacterial Infection") %>%
    group_by(patientID, bact_infection_present, organisms) %>%
    dplyr::slice_max(org_presence) %>%
    ungroup() %>%
    filter(org_presence == 1) %>%
    group_by(patientID, infx_stool, organisms) %>%
    dplyr::slice(1) %>%
    group_by(organisms) %>%
    tally() %>%
    ungroup() %>%
    mutate(total_infections = sum(n)) %>%
    replace_na(list(group = "unknown")) %>%
    group_by(organisms) %>%
    dplyr::summarize(percent = sum(n)/total_infections * 100,
        count = sum(n))

# Location of infections # where there is an infection
# within 30 days after transplant and not 0 days before
cult_location_infx_all <- peri_criteria_all %>%
    filter(bact_infection_present == "Yes", between(eday, 0,
        30)) %>%
    group_by(patientID, eday, key) %>%
    arrange(-infx_stool) %>%
    dplyr::slice(1) %>%
    ungroup() %>%
    select(patientID, bact_infection_present, infx_stool, organism1,
        key) %>%
    distinct() %>%
    mutate(organism1 = gsub(x = organism1, pattern = "\\s+",
        replacement = ""), organism1 = str_to_lower(string = organism1),
        organism1 = ifelse(grepl(x = organism1, pattern = "enterococcus|enterobacterales|klebsiella|escherichia|citrobacter|proteus|staphyl|clostrid|pseudo|steno|bacteroides|helico"),
            organism1, "Culture Negative")) %>%
    group_by(patientID, infx_stool) %>%
    mutate(`Enterococcus faecium` = grepl(x = organism1, pattern = "enterococcusfaecium",
        ignore.case = T), `Enterococcus faecalis` = grepl(x = organism1,
        pattern = "enterococcusfaecalis", ignore.case = T), `Enterococcus avium` = grepl(x = organism1,
        pattern = "enterococcusavium", ignore.case = T), `Enterococcus gallinarum` = grepl(x = organism1,
        pattern = "enterococcusgallinarum", ignore.case = T),
        `Klebsiella pneumoniae` = grepl(x = organism1, pattern = "klebsiellapneumoniae",
            ignore.case = T), `Enterobacter cloaceae` = grepl(x = organism1,
            pattern = "enterobactercloaceae", ignore.case = T),
        `Escherichia coli` = grepl(x = organism1, pattern = "escherichiacoli",
            ignore.case = T), `Citrobacter freundii` = grepl(x = organism1,
            pattern = "citrobacterfreundii", ignore.case = T),
        `Proteus mirabilis` = grepl(x = organism1, pattern = "proteusmirabilis",
            ignore.case = T), `Staphylococcus aureus` = grepl(x = organism1,
            pattern = "staphylococcusaureus", ignore.case = T),
        `Staphylococcus epidermis` = grepl(x = organism1, pattern = "staphylococcusepidermidis",
            ignore.case = T), `Pseudomonas aeruginosa` = grepl(x = organism1,
            pattern = "pseudomonasaeruginosa", ignore.case = T),
        `Stenotrophmonas maltophilia` = grepl(x = organism1,
            pattern = "stenotrophmonasmaltophilia", ignore.case = T),
        `Helicobacter pylori` = grepl(x = organism1, pattern = "helicobacterpylori",
            ignore.case = T), `Clostridium difficile` = grepl(x = organism1,
            pattern = "clostridiumdifficile|clostridioidesdifficile",
            ignore.case = T), `Bacteroides sp.` = grepl(x = organism1,
            pattern = "bacteroides", ignore.case = T), `Culture Negative` = grepl(x = organism1,
            pattern = "Culture Negative", ignore.case = T)) %>%
    pivot_longer(-c(patientID:key), names_to = "organisms", values_to = "org_presence") %>%
    mutate(organisms = ifelse(bact_infection_present == "No",
        "No Bacterial Infection", organisms), org_presence = ifelse(org_presence ==
        TRUE, 1, 0)) %>%
    group_by(patientID, key, infx_stool, bact_infection_present) %>%
    dplyr::slice_max(org_presence) %>%
    ungroup() %>%
    filter(org_presence == 1) %>%
    filter(organisms != "No Bacterial Infection") %>%
    group_by(patientID, infx_stool, key) %>%
    dplyr::slice(1) %>%
    group_by(key) %>%
    tally() %>%
    ungroup() %>%
    mutate(total_infections = sum(n)) %>%
    replace_na(list(group = "unknown")) %>%
    group_by(key) %>%
    dplyr::summarize(percent = sum(n)/total_infections * 100,
        count = sum(n))

# Type of Infection # where there is an infection within 30
# days after transplant and not 0 days before
cult_type_infx_all <- peri_criteria_all %>%
    filter(bact_infection_present == "Yes", grepl(variable, pattern = "anatomy.+"),
        between(eday, 0, 30)) %>%
    group_by(patientID, eday, diag_cat2) %>%
    arrange(-infx_stool) %>%
    dplyr::slice(1) %>%
    ungroup() %>%
    mutate(diag_cat3 = case_when(grepl("abdominal", diag_cat2,
        ignore.case = T) ~ "Abdominal", grepl("vir|bronchitis|covid-19|cmv",
        diag_cat2, ignore.case = T) ~ "Viral", grepl("bacteremia",
        diag_cat2, ignore.case = T) ~ "Bacteremia", grepl("thrush",
        diag_cat2, ignore.case = T) ~ "Fungal", grepl("cholangitis|empyema|panniculitis|peritonitis|latent tb",
        diag_cat2, ignore.case = T) ~ "Bacterial", grepl("cystitis|pyelonephritis",
        diag_cat2, ignore.case = T) ~ "Urinary Tract Infection",
        grepl("pneumonia", diag_cat2, ignore.case = T) ~ "Pneumonia",
        TRUE ~ diag_cat2), diag_cat3 = str_to_title(diag_cat3)) %>%
    ungroup() %>%
    select(patientID, bact_infection_present, infx_stool, organism1,
        micro1.factor, key, diag_cat3) %>%
    distinct() %>%
    mutate(organism1 = gsub(x = organism1, pattern = "\\s+",
        replacement = ""), organism1 = str_to_lower(string = organism1),
        organism1 = ifelse(grepl(x = organism1, pattern = "enterococcus|enterobacterales|klebsiella|escherichia|citrobacter|proteus|staphyl|clostrid|pseudo|steno|bacteroides|helico"),
            organism1, "Culture Negative")) %>%
    group_by(patientID, infx_stool) %>%
    mutate(`Enterococcus faecium` = grepl(x = organism1, pattern = "enterococcusfaecium",
        ignore.case = T), `Enterococcus faecalis` = grepl(x = organism1,
        pattern = "enterococcusfaecalis", ignore.case = T), `Enterococcus avium` = grepl(x = organism1,
        pattern = "enterococcusavium", ignore.case = T), `Enterococcus gallinarum` = grepl(x = organism1,
        pattern = "enterococcusgallinarum", ignore.case = T),
        `Klebsiella pneumoniae` = grepl(x = organism1, pattern = "klebsiellapneumoniae",
            ignore.case = T), `Enterobacter cloaceae` = grepl(x = organism1,
            pattern = "enterobactercloaceae", ignore.case = T),
        `Escherichia coli` = grepl(x = organism1, pattern = "escherichiacoli",
            ignore.case = T), `Citrobacter freundii` = grepl(x = organism1,
            pattern = "citrobacterfreundii", ignore.case = T),
        `Proteus mirabilis` = grepl(x = organism1, pattern = "proteusmirabilis",
            ignore.case = T), `Staphylococcus aureus` = grepl(x = organism1,
            pattern = "staphylococcusaureus", ignore.case = T),
        `Staphylococcus epidermis` = grepl(x = organism1, pattern = "staphylococcusepidermidis",
            ignore.case = T), `Pseudomonas aeruginosa` = grepl(x = organism1,
            pattern = "pseudomonasaeruginosa", ignore.case = T),
        `Stenotrophmonas maltophilia` = grepl(x = organism1,
            pattern = "stenotrophmonasmaltophilia", ignore.case = T),
        `Helicobacter pylori` = grepl(x = organism1, pattern = "helicobacterpylori",
            ignore.case = T), `Clostridium difficile` = grepl(x = organism1,
            pattern = "clostridiumdifficile|clostridioidesdifficile",
            ignore.case = T), `Bacteroides sp.` = grepl(x = organism1,
            pattern = "bacteroides", ignore.case = T), `Culture Negative` = grepl(x = organism1,
            pattern = "Culture Negative", ignore.case = T)) %>%
    pivot_longer(-c(patientID:diag_cat3), names_to = "organisms",
        values_to = "org_presence") %>%
    mutate(organisms = ifelse(bact_infection_present == "No",
        "No Bacterial Infection", organisms), org_presence = ifelse(org_presence ==
        TRUE, 1, 0)) %>%
    group_by(patientID, diag_cat3, infx_stool, bact_infection_present) %>%
    dplyr::slice_max(org_presence) %>%
    ungroup() %>%
    filter(org_presence == 1, organisms != "No Bacterial Infection") %>%
    ungroup() %>%
    group_by(patientID, infx_stool, diag_cat3) %>%
    dplyr::slice(1) %>%
    group_by(diag_cat3) %>%
    tally() %>%
    ungroup() %>%
    mutate(total_infections = sum(n)) %>%
    group_by(diag_cat3) %>%
    dplyr::summarize(percent = sum(n)/total_infections * 100,
        count = sum(n))

culture_tot <- bind_rows(cult_percent_infx_all, cult_location_infx_all,
    cult_type_infx_all) %>%
    mutate(percent = round(percent, 2))

# Read in csv to then append adjusted pvalues
write.csv(culture_tot, "./Results/Supplemental_Table_3.csv",
    row.names = FALSE)

Antibiotics

## Vector of variables to summarize
abx_vars <- c("Basiliximab", "Mycophenolate", "Steroid", "Systemic Vancomycin",
    "Tacrolimus", "Cefepime", "Metronidazole", "Piperacillin/Tazobactam",
    "Rifaximin", "Ceftriaxone", "Ciprofloxacin", "Gentamicin",
    "Tobramicin", "Daptomycin", "Meropenem", "Oral Vancomycin")

abx2 <- abx %>%
    filter(grepl(pattern = "basilix|tacro|steroid|mycophenolate|lactulose",
        medication_name, ignore.case = T) | grepl(pattern = "GLUCOCORTICOIDS|steroid",
        med_pharm_class, ignore.case = T) | grepl(pattern = "steroid",
        med_pharm_sub_class, ignore.case = T) | grepl("given",
        mar_action, ignore.case = T)) %>%
    mutate(Immunosuppressants = case_when(grepl("basilix", medication_name,
        ignore.case = T) ~ "Basiliximab", grepl("tacro", medication_name,
        ignore.case = T) ~ "Tacrolimus", grepl("one|ide|solu-cortef",
        medication_name, ignore.case = T) ~ "Steroid", grepl("mycophenolate",
        medication_name, ignore.case = T) ~ "Mycophenolate",
        grepl("lactulose", medication_name, ignore.case = T) ~
            "Lactulose"), Antibiotics = case_when(grepl("rifaximin",
        medication_name, ignore.case = T) ~ "Rifaximin", grepl("lactulose",
        medication_name, ignore.case = T) ~ "Lactulose", grepl("ceftriaxone",
        medication_name, ignore.case = T) ~ "Ceftriaxone", grepl("piperacillin|tazobactam",
        medication_name, ignore.case = T) ~ "Piperacillin/Tazobactam",
        grepl("cefepime", medication_name, ignore.case = T) ~
            "Cefepime", grepl("meropenem", medication_name, ignore.case = T) ~
            "Meropenem", grepl("gentamicin", medication_name,
            ignore.case = T) ~ "Gentamicin", grepl("tobramycin",
            medication_name, ignore.case = T) ~ "Tobramicin",
        grepl("vancomycin.+oral", medication_name, ignore.case = T) ~
            "Oral Vancomycin", grepl("vancomycin.+(IV|Intravenous)",
            medication_name, ignore.case = T) ~ "Systemic Vancomycin",
        grepl("METRONIDAZOLE", medication_name, ignore.case = T) ~
            "Metronidazole", grepl("DAPTOMYCIN", medication_name,
            ignore.case = T) ~ "Daptomycin", grepl("linezolid",
            medication_name, ignore.case = T) ~ "Linezolid",
        grepl("fluconazole", medication_name, ignore.case = T) ~
            "Fluconazole", grepl("micafungin", medication_name,
            ignore.case = T) ~ "Micafungin", grepl("cipro", medication_name,
            ignore.case = T) & !grepl("drop", dose_units, ignore.case = T) ~
            "Ciprofloxacin"), action = case_when(!is.na(Immunosuppressants) &
        between(days_transplant, 0, 30) | !is.na(Immunosuppressants) &
        ordering_mode == "Outpatient" ~ "keep", !is.na(Antibiotics) &
        between(days_transplant, -7, 1) ~ "keep", TRUE ~ "remove")) %>%
    group_by(patientID, Immunosuppressants, Antibiotics) %>%
    arrange(days_transplant) %>%
    filter(action == "keep") %>%
    dplyr::slice(1) %>%
    select(patientID, Immunosuppressants, Antibiotics) %>%
    left_join(peri_matrix_all %>%
        select(patientID, any_infection)) %>%
    pivot_longer(!c(patientID, any_infection), names_to = "variable",
        values_to = "value") %>%
    drop_na(value) %>%
    mutate(variable = 1) %>%
    pivot_wider(c(patientID, any_infection), names_from = "value",
        values_from = "variable", values_fn = min) %>%
    replace(is.na(.), 0)

abx_tab1_1 <- CreateTableOne(vars = abx_vars, testNonNormal = "wilcox.test",
    includeNA = FALSE, factorVars = abx_vars, strata = "any_infection",
    data = abx2)

summary(abx_tab1_1)
## 
##      ### Summary of categorical variables ### 
## 
## any_infection: 0
##                      var  n miss p.miss level freq percent cum.percent
##              Basiliximab 82    0    0.0     0   28    34.1        34.1
##                                             1   54    65.9       100.0
##                                                                       
##            Mycophenolate 82    0    0.0     0   10    12.2        12.2
##                                             1   72    87.8       100.0
##                                                                       
##                  Steroid 82    0    0.0     1   82   100.0       100.0
##                                                                       
##      Systemic Vancomycin 82    0    0.0     0   40    48.8        48.8
##                                             1   42    51.2       100.0
##                                                                       
##               Tacrolimus 82    0    0.0     1   82   100.0       100.0
##                                                                       
##                 Cefepime 82    0    0.0     0   56    68.3        68.3
##                                             1   26    31.7       100.0
##                                                                       
##            Metronidazole 82    0    0.0     0   48    58.5        58.5
##                                             1   34    41.5       100.0
##                                                                       
##  Piperacillin/Tazobactam 82    0    0.0     0   12    14.6        14.6
##                                             1   70    85.4       100.0
##                                                                       
##                Rifaximin 82    0    0.0     0   45    54.9        54.9
##                                             1   37    45.1       100.0
##                                                                       
##              Ceftriaxone 82    0    0.0     0   66    80.5        80.5
##                                             1   16    19.5       100.0
##                                                                       
##            Ciprofloxacin 82    0    0.0     0   67    81.7        81.7
##                                             1   15    18.3       100.0
##                                                                       
##               Gentamicin 82    0    0.0     0   81    98.8        98.8
##                                             1    1     1.2       100.0
##                                                                       
##               Tobramicin 82    0    0.0     0   78    95.1        95.1
##                                             1    4     4.9       100.0
##                                                                       
##               Daptomycin 82    0    0.0     0   81    98.8        98.8
##                                             1    1     1.2       100.0
##                                                                       
##                Meropenem 82    0    0.0     0   78    95.1        95.1
##                                             1    4     4.9       100.0
##                                                                       
##          Oral Vancomycin 82    0    0.0     0   81    98.8        98.8
##                                             1    1     1.2       100.0
##                                                                       
## ------------------------------------------------------------ 
## any_infection: 1
##                      var  n miss p.miss level freq percent cum.percent
##              Basiliximab 25    0    0.0     0    7    28.0        28.0
##                                             1   18    72.0       100.0
##                                                                       
##            Mycophenolate 25    0    0.0     0    1     4.0         4.0
##                                             1   24    96.0       100.0
##                                                                       
##                  Steroid 25    0    0.0     1   25   100.0       100.0
##                                                                       
##      Systemic Vancomycin 25    0    0.0     0    8    32.0        32.0
##                                             1   17    68.0       100.0
##                                                                       
##               Tacrolimus 25    0    0.0     1   25   100.0       100.0
##                                                                       
##                 Cefepime 25    0    0.0     0   14    56.0        56.0
##                                             1   11    44.0       100.0
##                                                                       
##            Metronidazole 25    0    0.0     0   13    52.0        52.0
##                                             1   12    48.0       100.0
##                                                                       
##  Piperacillin/Tazobactam 25    0    0.0     0    2     8.0         8.0
##                                             1   23    92.0       100.0
##                                                                       
##                Rifaximin 25    0    0.0     0   13    52.0        52.0
##                                             1   12    48.0       100.0
##                                                                       
##              Ceftriaxone 25    0    0.0     0   18    72.0        72.0
##                                             1    7    28.0       100.0
##                                                                       
##            Ciprofloxacin 25    0    0.0     0   21    84.0        84.0
##                                             1    4    16.0       100.0
##                                                                       
##               Gentamicin 25    0    0.0     0   24    96.0        96.0
##                                             1    1     4.0       100.0
##                                                                       
##               Tobramicin 25    0    0.0     0   22    88.0        88.0
##                                             1    3    12.0       100.0
##                                                                       
##               Daptomycin 25    0    0.0     0   22    88.0        88.0
##                                             1    3    12.0       100.0
##                                                                       
##                Meropenem 25    0    0.0     0   24    96.0        96.0
##                                             1    1     4.0       100.0
##                                                                       
##          Oral Vancomycin 25    0    0.0     0   24    96.0        96.0
##                                             1    1     4.0       100.0
##                                                                       
## 
## p-values
##                            pApprox     pExact
## Basiliximab             0.74143512 0.63341955
## Mycophenolate           0.42082759 0.45169519
## Steroid                         NA         NA
## Systemic Vancomycin     0.21234695 0.17136371
## Tacrolimus                      NA         NA
## Cefepime                0.37287647 0.33702201
## Metronidazole           0.72844870 0.64658181
## Piperacillin/Tazobactam 0.60142514 0.51269562
## Rifaximin               0.98119534 0.82233980
## Ceftriaxone             0.53110302 0.40817712
## Ciprofloxacin           1.00000000 1.00000000
## Gentamicin              0.95599591 0.41438900
## Tobramicin              0.42443880 0.35037337
## Daptomycin              0.05938894 0.03899733
## Meropenem               1.00000000 1.00000000
## Oral Vancomycin         0.95599591 0.41438900
## 
## Standardize mean differences
##                             1 vs 2
## Basiliximab             0.13310348
## Mycophenolate           0.30385759
## Steroid                 0.00000000
## Systemic Vancomycin     0.34709743
## Tacrolimus              0.00000000
## Cefepime                0.25550557
## Metronidazole           0.13174842
## Piperacillin/Tazobactam 0.21056770
## Rifaximin               0.05772164
## Ceftriaxone             0.20043582
## Ciprofloxacin           0.06085600
## Gentamicin              0.17507370
## Tobramicin              0.25833951
## Daptomycin              0.44449214
## Meropenem               0.04264158
## Oral Vancomycin         0.17507370
abx_tab1_2 <- print(abx_tab1_1, formatOptions = list(big.mark = ","))
##                                  Stratified by any_infection
##                                   0           1           p      test
##   n                               82          25                     
##   Basiliximab = 1 (%)             54 ( 65.9)  18 ( 72.0)   0.741     
##   Mycophenolate = 1 (%)           72 ( 87.8)  24 ( 96.0)   0.421     
##   Steroid = 1 (%)                 82 (100.0)  25 (100.0)      NA     
##   Systemic Vancomycin = 1 (%)     42 ( 51.2)  17 ( 68.0)   0.212     
##   Tacrolimus = 1 (%)              82 (100.0)  25 (100.0)      NA     
##   Cefepime = 1 (%)                26 ( 31.7)  11 ( 44.0)   0.373     
##   Metronidazole = 1 (%)           34 ( 41.5)  12 ( 48.0)   0.728     
##   Piperacillin/Tazobactam = 1 (%) 70 ( 85.4)  23 ( 92.0)   0.601     
##   Rifaximin = 1 (%)               37 ( 45.1)  12 ( 48.0)   0.981     
##   Ceftriaxone = 1 (%)             16 ( 19.5)   7 ( 28.0)   0.531     
##   Ciprofloxacin = 1 (%)           15 ( 18.3)   4 ( 16.0)   1.000     
##   Gentamicin = 1 (%)               1 (  1.2)   1 (  4.0)   0.956     
##   Tobramicin = 1 (%)               4 (  4.9)   3 ( 12.0)   0.424     
##   Daptomycin = 1 (%)               1 (  1.2)   3 ( 12.0)   0.059     
##   Meropenem = 1 (%)                4 (  4.9)   1 (  4.0)   1.000     
##   Oral Vancomycin = 1 (%)          1 (  1.2)   1 (  4.0)   0.956
write.csv(abx_tab1_2, "./Results/ABX_Table_1.csv", row.names = TRUE)  # Saving then reading in the same data allows for an easy way to adjust p-values, since it loads the object as a dataframe

# Need to adjust pvalues and arrange properly....hence the
# multiple dataframes below
abx_tab1_2_padjust1 <- read.csv("./Results/ABX_Table_1.csv") %>%
    dplyr::rename(` ` = X, `No Infection` = X0, `Bacterial Infection` = X1)

abx_tab1_2_padjust2 <- abx_tab1_2_padjust1 %>%
    mutate(` ` = factor(` `, levels = abx_tab1_2_padjust1$` `))

abx_tab1_2_padjust3 <- abx_tab1_2_padjust1 %>%
    mutate(test = ifelse(!is.na(p) & is.na(test), "chi.sq", "")) %>%
    group_by(test) %>%
    rstatix::adjust_pvalue(p.col = "p", method = "BH") %>%
    ungroup() %>%
    mutate(` ` = factor(` `, abx_tab1_2_padjust2$` `)) %>%
    arrange(` `) %>%
    mutate(p = ifelse(is.na(p), "", p), p.adj = ifelse(is.na(p.adj),
        "", p.adj))

# Read in csv to then append adjusted pvalues
write.csv(abx_tab1_2_padjust3, "./Results/ABX_Table_1_padjust.csv",
    row.names = FALSE)

Supplemental Figure 1: Fecal Sample Collected from Transplant

sample_days <- metaphlan_df_sumry %>%
    group_by(sampleID) %>%
    slice(1) %>%
    filter(db == "Liver Transplant") %>%
    select(sampleID, diversity_group) %>%
    left_join(sample_lookup %>%
        select(sampleID, sample_days_from_transplant)) %>%
    ungroup()


# Quartiles of samples
clin_quartiles <- sample_days %>%
    select(sample_days_from_transplant) %>%
    summarise(minimum = min(sample_days_from_transplant, na.rm = T),
        lower = unname(quantile(sample_days_from_transplant,
            probs = 0.25, na.rm = T)), median = unname(quantile(sample_days_from_transplant,
            probs = 0.5, na.rm = T)), upper = unname(quantile(sample_days_from_transplant,
            probs = 0.75, na.rm = T)), maximum = max(sample_days_from_transplant,
            na.rm = T)) %>%
    mutate(placeholder = 1) %>%
    pivot_longer(!placeholder, names_to = "ranges", values_to = "value")



sample_days_histogram <- sample_days %>%
    ggplot(., aes(x = sample_days_from_transplant)) + geom_histogram(color = "black",
    fill = "grey", binwidth = 1) + geom_vline(aes(xintercept = 0),
    color = "orangered", linetype = "solid", size = 1.2) + geom_vline(aes(xintercept = median(sample_days_from_transplant,
    na.rm = TRUE)), color = "cyan2", linetype = "dashed", size = 1.2) +
    geom_text(aes(x = median(sample_days_from_transplant, na.rm = TRUE),
        y = 12.5), color = "cyan2", label = paste0("Median = ",
        clin_quartiles %>%
            filter(ranges == "median") %>%
            select(value), " days"), nudge_x = 4) + geom_text(label = "Transplant Date = Day 0",
    x = -4, y = 12.5, color = "orangered") + theme_bw() + theme(panel.grid = eb(),
    axis.text = et(color = "black", size = 12), axis.title = et(color = "black",
        size = 14), legend.position = "none") + scale_x_continuous(breaks = seq(-7,
    30, 5), limits = c(-7, 35)) + ylab("Number of Samples\n")


sample_days_boxplot <- sample_days %>%
    ggplot(., aes(x = sample_days_from_transplant, y = diversity_group,
        fill = diversity_group)) + geom_vline(aes(xintercept = 0),
    color = "orangered", linetype = "solid", size = 1.2) + geom_vline(aes(xintercept = median(sample_days_from_transplant,
    na.rm = TRUE)), color = "cyan2", linetype = "dashed", size = 1.2) +
    geom_violin() + geom_boxplot(alpha = 0.5, width = 0.15, outlier.shape = NA,
    fill = "white", color = "white") + stat_compare_means(comparisons = list(c("High Diversity",
    "Medium Diversity"), c("High Diversity", "Low Diversity"),
    c("Medium Diversity", "Low Diversity")), bracket.size = 0.3,
    step.increase = 0.07) + theme_bw() + theme(panel.grid = eb(),
    axis.text.x = et(color = "black", size = 12), axis.text.y = et(color = "black",
        size = 12), axis.ticks.y = eb(), axis.title = et(color = "black",
        size = 14), legend.position = "none") + scale_x_continuous(breaks = seq(-7,
    30, 5), limits = c(-7, 35)) + scale_fill_manual(values = c("#3A001E",
    "#8A0246", "#C20463")) + ylab("Diversity Groups\n") + xlab("\nDays to Sample")

pdf(file = "./Results/Supplemental_Figure_1.pdf", height = 10,
    width = 12, onefile = F)

gg.stack(sample_days_histogram, sample_days_boxplot)

dev.off()
## quartz_off_screen 
##                 2
gg.stack(sample_days_histogram, sample_days_boxplot)

Miscellaneous Statistics

# Percent of samples with > 90% of a single taxon
metaphlan_df2 %>%
    left_join(metaphlan_df_sumry %>%
        select(sampleID, diversity_group)) %>%
    filter(grepl(x = sampleID, pattern = "^lt")) %>%
    group_by(sampleID) %>%
    filter(pctseqs > 0.9) %>%
    select(-y.text, -db) %>%
    write.csv(., "./Results/90percent_taxon.csv", row.names = FALSE)

Save Data Image

save.image(file = "./Data/LT_Modeling.RData")